diff --git a/modules/imgproc/src/imgwarp.avx2.cpp b/modules/imgproc/src/imgwarp.avx2.cpp index 63ef2ae52d..6a795a7c92 100644 --- a/modules/imgproc/src/imgwarp.avx2.cpp +++ b/modules/imgproc/src/imgwarp.avx2.cpp @@ -55,207 +55,6 @@ namespace cv namespace opt_AVX2 { -class resizeNNInvokerAVX4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int avxWidth = width - (width & 0x7); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - if(((int64)(dst.data + dst.step) & 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_maskstore_epi32((int*)D, mask, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_storeu_si256((__m256i*)D, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - _mm256_zeroupper(); - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); - resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); -}; - -class resizeNNInvokerAVX2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - //int avxWidth = (width - 1) - ((width - 1) & 0x7); - int avxWidth = width - (width & 0xf); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, - 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); - const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); - //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, - // 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); - if(((int64)(dst.data + dst.step) 
& 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_maskstore_epi32((int*)D, mask, ints_permuted); - D += 32; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_storeu_si256((__m256i*)D, ints_permuted); - D += 32; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - _mm256_zeroupper(); - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); - resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); -}; - -void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { const int AB_BITS = MAX(10, (int)INTER_BITS); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 69ccbac527..8c854a41ea 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -122,3408 +122,208 @@ static bool 
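The two invokers removed above share one trick: the caller precomputes x_ofs[], the source byte offset of every destination column, so each output row becomes eight-pixel gathers (_mm256_i32gather_epi32) followed by one 256-bit store; the aligned and unaligned branches differ only in the store instruction, and the 2-byte-pixel variant additionally gathers from S and S-2 and blends/shuffles to pull out 16-bit pixels. A minimal sketch of the 4-byte-per-pixel case, assuming AVX2; the free function and its names are illustrative only, not the removed class:

// Illustrative sketch (not the removed OpenCV code): gather-based
// nearest-neighbor resize of one row of 4-byte pixels, given a
// precomputed table x_ofs[] of source byte offsets per destination column.
#include <immintrin.h>

static void nn_resize_row_4byte(const unsigned char* S,  // source row
                                unsigned char* D,        // destination row
                                const int* x_ofs,        // byte offset for each dst column
                                int width)               // dst width in pixels
{
    int x = 0;
    const int avxWidth = width - (width & 7);            // largest multiple of 8
    for( ; x < avxWidth; x += 8 )
    {
        // load 8 byte offsets, gather one 32-bit pixel per offset, store 8 pixels
        __m256i idx = _mm256_loadu_si256((const __m256i*)(x_ofs + x));
        __m256i pix = _mm256_i32gather_epi32((const int*)S, idx, 1); // scale=1: offsets are bytes
        _mm256_storeu_si256((__m256i*)(D + x*4), pix);
    }
    for( ; x < width; x++ )                               // scalar tail, same as the removed code
        *(int*)(D + x*4) = *(const int*)(S + x_ofs[x]);
}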
IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSiz /************** interpolation formulas and tables ***************/ -const int INTER_RESIZE_COEF_BITS=11; -const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; - const int INTER_REMAP_COEF_BITS=15; -const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS; - -static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; - -static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; -static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; - -#if CV_SSE2 || CV_NEON -static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; -static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); -#endif - -static float BicubicTab_f[INTER_TAB_SIZE2][4][4]; -static short BicubicTab_i[INTER_TAB_SIZE2][4][4]; - -static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8]; -static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8]; - -static inline void interpolateLinear( float x, float* coeffs ) -{ - coeffs[0] = 1.f - x; - coeffs[1] = x; -} - -static inline void interpolateCubic( float x, float* coeffs ) -{ - const float A = -0.75f; - - coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; - coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; - coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - -static inline void interpolateLanczos4( float x, float* coeffs ) -{ - static const double s45 = 0.70710678118654752440084436210485; - static const double cs[][2]= - {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; - - if( x < FLT_EPSILON ) - { - for( int i = 0; i < 8; i++ ) - coeffs[i] = 0; - coeffs[3] = 1; - return; - } - - float sum = 0; - double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); - for(int i = 0; i < 8; i++ ) - { - double y = -(x+3-i)*CV_PI*0.25; - coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); - sum += coeffs[i]; - } - - sum = 1.f/sum; - for(int i = 0; i < 8; i++ ) - coeffs[i] *= sum; -} - -static void initInterTab1D(int method, float* tab, int tabsz) -{ - float scale = 1.f/tabsz; - if( method == INTER_LINEAR ) - { - for( int i = 0; i < tabsz; i++, tab += 2 ) - interpolateLinear( i*scale, tab ); - } - else if( method == INTER_CUBIC ) - { - for( int i = 0; i < tabsz; i++, tab += 4 ) - interpolateCubic( i*scale, tab ); - } - else if( method == INTER_LANCZOS4 ) - { - for( int i = 0; i < tabsz; i++, tab += 8 ) - interpolateLanczos4( i*scale, tab ); - } - else - CV_Error( CV_StsBadArg, "Unknown interpolation method" ); -} - - -static const void* initInterTab2D( int method, bool fixpt ) -{ - static bool inittab[INTER_MAX+1] = {false}; - float* tab = 0; - short* itab = 0; - int ksize = 0; - if( method == INTER_LINEAR ) - tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; - else if( method == INTER_CUBIC ) - tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; - else if( method == INTER_LANCZOS4 ) - tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; - else - CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" ); - - if( !inittab[method] ) - { - AutoBuffer _tab(8*INTER_TAB_SIZE); - int i, j, k1, k2; - initInterTab1D(method, _tab, INTER_TAB_SIZE); - for( i = 0; i < INTER_TAB_SIZE; i++ ) - for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) - { - int isum = 0; - NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; - NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; - - for( k1 = 0; k1 < ksize; k1++ ) - { - float vy = _tab[i*ksize + k1]; - 
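For reference, the bicubic weights deleted above come from the Keys kernel with A = -0.75; for any fractional offset the four weights sum to 1, and initInterTab2D() builds the 2-D tables as outer products of these 1-D rows, then adjusts one of the central fixed-point entries so every kernel sums to exactly INTER_REMAP_COEF_SCALE. A small standalone check of the coefficient formula (hypothetical test program, not part of the removed file):

// Sketch: reproduce the interpolateCubic() weights and verify they sum to 1.
#include <cstdio>

static void cubic_coeffs(float x, float* c)            // same formula as interpolateCubic
{
    const float A = -0.75f;
    c[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    c[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    c[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    c[3] = 1.f - c[0] - c[1] - c[2];
}

int main()
{
    float c[4];
    cubic_coeffs(0.25f, c);                             // e.g. tab index 8 of INTER_TAB_SIZE=32
    printf("%f %f %f %f  sum=%f\n", c[0], c[1], c[2], c[3], c[0]+c[1]+c[2]+c[3]);
    return 0;
}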
for( k2 = 0; k2 < ksize; k2++ ) - { - float v = vy*_tab[j*ksize + k2]; - tab[k1*ksize + k2] = v; - isum += itab[k1*ksize + k2] = saturate_cast(v*INTER_REMAP_COEF_SCALE); - } - } - - if( isum != INTER_REMAP_COEF_SCALE ) - { - int diff = isum - INTER_REMAP_COEF_SCALE; - int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; - for( k1 = ksize2; k1 < ksize2+2; k1++ ) - for( k2 = ksize2; k2 < ksize2+2; k2++ ) - { - if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - if( diff < 0 ) - itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); - else - itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); - } - } - tab -= INTER_TAB_SIZE2*ksize*ksize; - itab -= INTER_TAB_SIZE2*ksize*ksize; -#if CV_SSE2 || CV_NEON - if( method == INTER_LINEAR ) - { - for( i = 0; i < INTER_TAB_SIZE2; i++ ) - for( j = 0; j < 4; j++ ) - { - BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; - BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; - BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; - BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; - } - } -#endif - inittab[method] = true; - } - return fixpt ? (const void*)itab : (const void*)tab; -} - -#ifndef __MINGW32__ -static bool initAllInterTab2D() -{ - return initInterTab2D( INTER_LINEAR, false ) && - initInterTab2D( INTER_LINEAR, true ) && - initInterTab2D( INTER_CUBIC, false ) && - initInterTab2D( INTER_CUBIC, true ) && - initInterTab2D( INTER_LANCZOS4, false ) && - initInterTab2D( INTER_LANCZOS4, true ); -} - -static volatile bool doInitAllInterTab2D = initAllInterTab2D(); -#endif - -template struct Cast -{ - typedef ST type1; - typedef DT rtype; - - DT operator()(ST val) const { return saturate_cast
<DT>(val); }
-};
-
-template<typename ST, typename DT, int bits> struct FixedPtCast
-{
-    typedef ST type1;
-    typedef DT rtype;
-    enum { SHIFT = bits, DELTA = 1 << (bits-1) };
-
-    DT operator()(ST val) const { return saturate_cast<DT>
((val + DELTA)>>SHIFT); } -}; - -/****************************************************************************************\ -* Resize * -\****************************************************************************************/ - -class resizeNNInvoker : - public ParallelLoopBody -{ -public: - resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x, pix_size = (int)src.elemSize(); - - for( y = range.start; y < range.end; y++ ) - { - uchar* D = dst.data + dst.step*y; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.ptr(sy); - - switch( pix_size ) - { - case 1: - for( x = 0; x <= dsize.width - 2; x += 2 ) - { - uchar t0 = S[x_ofs[x]]; - uchar t1 = S[x_ofs[x+1]]; - D[x] = t0; - D[x+1] = t1; - } - - for( ; x < dsize.width; x++ ) - D[x] = S[x_ofs[x]]; - break; - case 2: - for( x = 0; x < dsize.width; x++ ) - *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); - break; - case 3: - for( x = 0; x < dsize.width; x++, D += 3 ) - { - const uchar* _tS = S + x_ofs[x]; - D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; - } - break; - case 4: - for( x = 0; x < dsize.width; x++ ) - *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); - break; - case 6: - for( x = 0; x < dsize.width; x++, D += 6 ) - { - const ushort* _tS = (const ushort*)(S + x_ofs[x]); - ushort* _tD = (ushort*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; - } - break; - case 8: - for( x = 0; x < dsize.width; x++, D += 8 ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; - } - break; - case 12: - for( x = 0; x < dsize.width; x++, D += 12 ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; - } - break; - default: - for( x = 0; x < dsize.width; x++, D += pix_size ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - for( int k = 0; k < pix_size4; k++ ) - _tD[k] = _tS[k]; - } - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvoker(const resizeNNInvoker&); - resizeNNInvoker& operator=(const resizeNNInvoker&); -}; - -static void -resizeNN( const Mat& src, Mat& dst, double fx, double fy ) -{ - Size ssize = src.size(), dsize = dst.size(); - AutoBuffer _x_ofs(dsize.width); - int* x_ofs = _x_ofs; - int pix_size = (int)src.elemSize(); - int pix_size4 = (int)(pix_size / sizeof(int)); - double ifx = 1./fx, ify = 1./fy; - int x; - - for( x = 0; x < dsize.width; x++ ) - { - int sx = cvFloor(x*ifx); - x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; - } - - Range range(0, dsize.height); -#if CV_TRY_AVX2 - if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4))) - { - if(pix_size == 2) - opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify); - else - opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify); - } - else -#endif -#if CV_TRY_SSE4_1 - if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) - { - if(pix_size == 2) - opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); - else - opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); - } - else -#endif - { - resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } -} - - -struct 
VResizeNoVec -{ - int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } -}; - -struct HResizeNoVec -{ - int operator()(const uchar**, uchar**, int, const int*, - const uchar*, int, int, int, int, int) const { return 0; } -}; - -#if CV_SSE2 - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1]; - int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - - for( ; x < width - 4; x += 4 ) - { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); - } - - return x; - } -}; - - -template struct VResizeLinearVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, 
const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - - for( ; x < width - 4; x += 4 ) - { - __m128 x0, y0; - __m128i t0; - x0 = _mm_loadu_ps(S0 + x); - y0 = _mm_loadu_ps(S1 + x); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; -typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + 
x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - - return x; - } -}; - - -struct VResizeCubicVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = 
_mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - - return x; - } -}; - - -template struct VResizeCubicVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - __m128i t0, t1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); - t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); - - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); - _mm_storeu_si128( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; -typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - _mm_storeu_ps( dst + x, s0); - _mm_storeu_ps( dst + x + 4, s1); - } - - return x; - } -}; - -#if CV_TRY_SSE4_1 - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); - else return 0; - } -}; - -#else - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; - -#endif - -struct 
VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); - } - - return x; - } -}; - - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - _mm_storeu_ps(dst + x, v_dst); - } - - return x; - } -}; - - -#elif CV_NEON - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; - const short* beta = (const short*)_beta; - int x = 0; - 
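All of the Lanczos4 vertical kernels above (and the NEON ones that follow) vectorize the same 8-tap accumulation: each output row is a weighted sum of the eight buffered, horizontally resized rows with the beta[] weights. A scalar reference for what they compute (illustrative helper, hypothetical name):

// Scalar equivalent of the 8-tap vertical Lanczos pass vectorized above.
static void vresize_lanczos4_row(const float* const S[8],  // 8 horizontally-resized rows
                                 float* dst,
                                 const float beta[8],      // vertical Lanczos weights
                                 int width)
{
    for( int x = 0; x < width; x++ )
    {
        float v = 0.f;
        for( int k = 0; k < 8; k++ )
            v += S[k][x] * beta[k];                        // same sum the SSE/NEON kernels form
        dst[x] = v;
    }
}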
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); - - for( ; x <= width - 16; x += 16) - { - int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); - int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); - - int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); - - v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); - v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); - v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); - v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); - - v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); - - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - short* dst = (short*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), 
v_src11 = vld1q_f32(S1 + x + 4); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); - } - - return x; - } -}; - -typedef VResizeNoVec VResizeCubicVec_32s8u; - -struct VResizeCubicVec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeCubicVec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - short* dst = (short*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x))); - vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - ushort * dst = (ushort*)_dst; - int x = 0; - float32x4_t 
v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); - } - - return x; - } -}; - -#else - -typedef VResizeNoVec VResizeLinearVec_32s8u; -typedef VResizeNoVec VResizeLinearVec_32f16u; -typedef VResizeNoVec VResizeLinearVec_32f16s; -typedef VResizeNoVec VResizeLinearVec_32f; - -typedef VResizeNoVec VResizeCubicVec_32s8u; -typedef VResizeNoVec VResizeCubicVec_32f16u; -typedef VResizeNoVec VResizeCubicVec_32f16s; -typedef VResizeNoVec VResizeCubicVec_32f; - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; -typedef VResizeNoVec VResizeLanczos4Vec_32f16s; -typedef VResizeNoVec VResizeLanczos4Vec_32f; - -#endif - -typedef HResizeNoVec HResizeLinearVec_8u32s; -typedef HResizeNoVec HResizeLinearVec_16u32f; -typedef HResizeNoVec HResizeLinearVec_16s32f; -typedef HResizeNoVec HResizeLinearVec_32f; -typedef HResizeNoVec HResizeLinearVec_64f; - - -template -struct HResizeLinear -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - int dx, k; - VecOp vecOp; - - int dx0 = vecOp((const uchar**)src, (uchar**)dst, count, - xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); - - for( k = 0; k <= count - 2; k++ ) - { - const T *S0 = src[k], *S1 = src[k+1]; - WT *D0 = dst[k], *D1 = dst[k+1]; - for( dx = dx0; dx < xmax; dx++ ) - { - int sx = xofs[dx]; - WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; - WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; - WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; - D0[dx] = t0; D1[dx] = t1; - } - - for( ; dx < dwidth; dx++ ) - { - int sx = xofs[dx]; - D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); - } - } - - for( ; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - for( dx = 0; dx < xmax; dx++ ) - { - int sx = xofs[dx]; - D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; - } - - for( ; dx < dwidth; dx++ ) - D[dx] = WT(S[xofs[dx]]*ONE); - } - } -}; - - -template -struct VResizeLinear -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - WT b0 = beta[0], b1 = beta[1]; - const WT *S0 = src[0], *S1 = src[1]; - CastOp castOp; - VecOp vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - WT t0, t1; - t0 = S0[x]*b0 + S1[x]*b1; - t1 = S0[x+1]*b0 + S1[x+1]*b1; - dst[x] = castOp(t0); dst[x+1] = castOp(t1); - t0 = S0[x+2]*b0 + S1[x+2]*b1; - t1 = S0[x+3]*b0 + S1[x+3]*b1; - dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); - } - #endif - for( ; x < width; x++ ) - dst[x] = castOp(S0[x]*b0 + S1[x]*b1); - } -}; - -template<> -struct VResizeLinear, VResizeLinearVec_32s8u> -{ - typedef uchar value_type; - typedef int buf_type; - typedef short alpha_type; - - void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const - { - alpha_type b0 = beta[0], b1 = beta[1]; - const buf_type *S0 = src[0], *S1 = src[1]; - VResizeLinearVec_32s8u vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { 
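            // Fixed-point bookkeeping for the statements below (explanatory note, not part of
            // the original diff): S0/S1 carry a 2^11 scale from the horizontal alpha weights
            // and b0/b1 carry another 2^11 from the vertical beta weights, so 22 bits in
            // total must be shifted out.  Splitting that as (>>4), (>>16), (>>2) mirrors the
            // SSE kernel above, where the >>16 is the implicit shift of _mm_mulhi_epi16 on
            // the pre-shifted 16-bit rows; the "+ 2" rounds to nearest before the final >>2.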
- dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); - dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); - dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); - dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); - } - #endif - for( ; x < width; x++ ) - dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); - } -}; - - -template -struct HResizeCubic -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - for( int k = 0; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - int dx = 0, limit = xmin; - for(;;) - { - for( ; dx < limit; dx++, alpha += 4 ) - { - int j, sx = xofs[dx] - cn; - WT v = 0; - for( j = 0; j < 4; j++ ) - { - int sxj = sx + j*cn; - if( (unsigned)sxj >= (unsigned)swidth ) - { - while( sxj < 0 ) - sxj += cn; - while( sxj >= swidth ) - sxj -= cn; - } - v += S[sxj]*alpha[j]; - } - D[dx] = v; - } - if( limit == dwidth ) - break; - for( ; dx < xmax; dx++, alpha += 4 ) - { - int sx = xofs[dx]; - D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + - S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; - } - limit = dwidth; - } - alpha -= dwidth*4; - } - } -}; - - -template -struct VResizeCubic -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; - const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - CastOp castOp; - VecOp vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - for( ; x < width; x++ ) - dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); - } -}; - - -template -struct HResizeLanczos4 -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - for( int k = 0; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - int dx = 0, limit = xmin; - for(;;) - { - for( ; dx < limit; dx++, alpha += 8 ) - { - int j, sx = xofs[dx] - cn*3; - WT v = 0; - for( j = 0; j < 8; j++ ) - { - int sxj = sx + j*cn; - if( (unsigned)sxj >= (unsigned)swidth ) - { - while( sxj < 0 ) - sxj += cn; - while( sxj >= swidth ) - sxj -= cn; - } - v += S[sxj]*alpha[j]; - } - D[dx] = v; - } - if( limit == dwidth ) - break; - for( ; dx < xmax; dx++, alpha += 8 ) - { - int sx = xofs[dx]; - D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + - S[sx-cn]*alpha[2] + S[sx]*alpha[3] + - S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + - S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; - } - limit = dwidth; - } - alpha -= dwidth*8; - } - } -}; - - -template -struct VResizeLanczos4 -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - CastOp castOp; - VecOp vecOp; - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - WT b = beta[0]; - const WT* S = src[0]; - WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; - - for( int k = 1; k < 8; k++ ) - { - b = beta[k]; S = src[k]; - s0 += 
S[x]*b; s1 += S[x+1]*b; - s2 += S[x+2]*b; s3 += S[x+3]*b; - } - - dst[x] = castOp(s0); dst[x+1] = castOp(s1); - dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); - } - #endif - for( ; x < width; x++ ) - { - dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + - src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + - src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]); - } - } -}; - - -static inline int clip(int x, int a, int b) -{ - return x >= a ? (x < b ? x : b-1) : a; -} - -static const int MAX_ESIZE=16; - -template -class resizeGeneric_Invoker : - public ParallelLoopBody -{ -public: - typedef typename HResize::value_type T; - typedef typename HResize::buf_type WT; - typedef typename HResize::alpha_type AT; - - resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, - const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, - int _ksize, int _xmin, int _xmax) : - ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), - alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), - ksize(_ksize), xmin(_xmin), xmax(_xmax) - { - CV_Assert(ksize <= MAX_ESIZE); - } - - virtual void operator() (const Range& range) const - { - int dy, cn = src.channels(); - HResize hresize; - VResize vresize; - - int bufstep = (int)alignSize(dsize.width, 16); - AutoBuffer _buffer(bufstep*ksize); - const T* srows[MAX_ESIZE]={0}; - WT* rows[MAX_ESIZE]={0}; - int prev_sy[MAX_ESIZE]; - - for(int k = 0; k < ksize; k++ ) - { - prev_sy[k] = -1; - rows[k] = (WT*)_buffer + bufstep*k; - } - - const AT* beta = _beta + ksize * range.start; - - for( dy = range.start; dy < range.end; dy++, beta += ksize ) - { - int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; - - for(int k = 0; k < ksize; k++ ) - { - int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); - for( k1 = std::max(k1, k); k1 < ksize; k1++ ) - { - if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. - { - if( k1 > k ) - memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); - break; - } - } - if( k1 == ksize ) - k0 = std::min(k0, k); // remember the first row that needs to be computed - srows[k] = src.template ptr(sy); - prev_sy[k] = sy; - } - - if( k0 < ksize ) - hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), - ssize.width, dsize.width, cn, xmin, xmax ); - vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); - } - } - -private: - Mat src; - Mat dst; - const int* xofs, *yofs; - const AT* alpha, *_beta; - Size ssize, dsize; - const int ksize, xmin, xmax; - - resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); -}; - -template -static void resizeGeneric_( const Mat& src, Mat& dst, - const int* xofs, const void* _alpha, - const int* yofs, const void* _beta, - int xmin, int xmax, int ksize ) -{ - typedef typename HResize::alpha_type AT; - - const AT* beta = (const AT*)_beta; - Size ssize = src.size(), dsize = dst.size(); - int cn = src.channels(); - ssize.width *= cn; - dsize.width *= cn; - xmin *= cn; - xmax *= cn; - // image resize is a separable operation. 
In case of not too strong - - Range range(0, dsize.height); - resizeGeneric_Invoker invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, - ssize, dsize, ksize, xmin, xmax); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); -} - -template -struct ResizeAreaFastNoVec -{ - ResizeAreaFastNoVec(int, int) { } - ResizeAreaFastNoVec(int, int, int, int) { } - int operator() (const T*, T*, int) const - { return 0; } -}; - -#if CV_NEON - -class ResizeAreaFastVec_SIMD_8u -{ -public: - ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const uchar* S, uchar* D, int w) const - { - int dx = 0; - const uchar* S0 = S, * S1 = S0 + step; - - uint16x8_t v_2 = vdupq_n_u16(2); - - if (cn == 1) - { - for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) - { - uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); - - uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); - v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); - v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); - - uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); - v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); - v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); - - vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); - - uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); - uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); - uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); - uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); - - uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), - vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); - uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), - vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); - uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); - - vst1_u8(D, vmovn_u16(v_dst)); - } - } - - return dx; - } - -private: - int cn, step; -}; - -class ResizeAreaFastVec_SIMD_16u -{ -public: - ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const ushort * S, ushort * D, int w) const - { - int dx = 0; - const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); - - uint32x4_t v_2 = vdupq_n_u32(2); - - if (cn == 1) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); - - uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); - v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); - v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); - - uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); - v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); - v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); - - vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); - uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), - 
vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); - vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); - } - } - - return dx; - } - -private: - int cn, step; -}; - -class ResizeAreaFastVec_SIMD_16s -{ -public: - ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const short * S, short * D, int w) const - { - int dx = 0; - const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); - - int32x4_t v_2 = vdupq_n_s32(2); - - if (cn == 1) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); - - int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); - v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); - v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); - - int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); - v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); - v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); - - vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); - int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), - vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); - vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); - } - } - - return dx; - } - -private: - int cn, step; -}; - -struct ResizeAreaFastVec_SIMD_32f -{ - ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - cn(_cn), step(_step) - { - fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - } - - int operator() (const float * S, float * D, int w) const - { - if (!fast_mode) - return 0; - - const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); - int dx = 0; - - float32x4_t v_025 = vdupq_n_f32(0.25f); - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); - - float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); - float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); - - vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); - float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); - - vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); - } - } - - return dx; - } - -private: - int cn; - bool fast_mode; - int step; -}; - -#elif CV_SSE2 - -class ResizeAreaFastVec_SIMD_8u -{ -public: - ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const uchar* S, uchar* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const uchar* S0 = S; - const uchar* S1 = S0 + step; - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi16(2); - - if (cn == 1) - { - __m128i masklow = _mm_set1_epi16(0x00ff); - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); - __m128i 
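// Every ResizeAreaFastVec_SIMD_* variant above and below computes the same
// quantity: each output pixel is the rounded mean of a 2x2 source block,
// (s00 + s01 + s10 + s11 + 2) >> 2.  NEON gets the horizontal pairs for free
// from the de-interleaving vld2q loads; the SSE2 code that follows rebuilds
// them with mask/shift.  Scalar reference for one grey (cn == 1) row pair,
// illustrative only:
static void area2x2_row_ref(const unsigned char* S0, const unsigned char* S1,
                            unsigned char* D, int dwidth)
{
    for (int dx = 0; dx < dwidth; dx++)
    {
        int sx = dx * 2;
        D[dx] = (unsigned char)((S0[sx] + S0[sx + 1] + S1[sx] + S1[sx + 1] + 2) >> 2);
    }
}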
s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)(D+3), s0); - } - else - { - CV_Assert(cn == 4); - int v[] = { 0, 0, -1, -1 }; - __m128i mask = _mm_loadu_si128((const __m128i*)v); - - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res0 = _mm_srli_epi16(s0, 2); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res1 = _mm_srli_epi16(s0, 2); - s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), - _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); - _mm_storel_epi64((__m128i*)(D), s0); - } - } - - return dx; - } - -private: - int cn; - bool use_simd; - int step; -}; - -class ResizeAreaFastVec_SIMD_16u -{ -public: - ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const ushort* S, ushort* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const ushort* S0 = (const ushort*)S; - const ushort* S1 = (const ushort*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - -#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srli_epi32(s0, 2); - s0 = _mm_packus_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D 
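// SSE2 has no unsigned 32->16 saturating pack (_mm_packus_epi32 is SSE4.1),
// so the macro above fakes one.  The trick is safe here because a 2x2 mean of
// ushort values always fits in 16 bits, so only the low halves need to be
// kept intact: sign-extending the low 16 bits (slli + srai) and then using
// the signed pack reproduces the exact bit pattern.  Equivalent standalone
// helper, for reference (assumes every 32-bit lane is < 65536):
#include <emmintrin.h>

static inline __m128i pack_low16_of_u32(__m128i a)
{
    __m128i lo16 = _mm_srai_epi32(_mm_slli_epi32(a, 16), 16);  // sign-extend low halves
    return _mm_packs_epi32(lo16, _mm_setzero_si128());         // signed pack keeps the bits
}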
+= 3) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - else - { - CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); - __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - } - -#undef _mm_packus_epi32 - - return dx; - } - -private: - int cn; - int step; - bool use_simd; -}; - -class ResizeAreaFastVec_SIMD_16s -{ -public: - ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short* S, short* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const short* S0 = (const short*)S; - const short* S1 = (const short*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); - __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srai_epi32(s0, 2); - s0 = _mm_packs_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); - __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - else - { - CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); - __m128i r1_32l = 
_mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - } - - return dx; - } - -private: - int cn; - int step; - bool use_simd; -}; - -struct ResizeAreaFastVec_SIMD_32f -{ - ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - cn(_cn), step(_step) - { - fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const float * S, float * D, int w) const - { - if (!fast_mode) - return 0; - - const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); - int dx = 0; - - __m128 v_025 = _mm_set1_ps(0.25f); - - if (cn == 1) - { - const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), - v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); - - __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), - _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); - __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), - _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); - __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); - } - } - - return dx; - } - -private: - int cn; - bool fast_mode; - int step; -}; - -#else - -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_8u; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; - -#endif - -template -struct ResizeAreaFastVec -{ - ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) - { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); - } - - int operator() (const T* S, T* D, int w) const - { - if (!fast_mode) - return 0; - - const T* nextS = (const T*)((const uchar*)S + step); - int dx = vecOp(S, D, w); - - if (cn == 1) - for( ; dx < w; ++dx ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); - } - else if (cn == 3) - for( ; dx < w; dx += 3 ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); - D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); - D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); - } - else - { - CV_Assert(cn == 4); - for( ; dx < w; dx += 4 ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); - D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); - D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); - D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 
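// The 16s paths above rely on another SSE2 idiom: _mm_cvtepi16_epi32 does not
// exist before SSE4.1, so a signed short is widened by unpacking it against
// zero into the *high* half of each 32-bit lane and arithmetic-shifting it
// back down, which also performs the sign extension.  Isolated for reference:
#include <emmintrin.h>

static inline __m128i widen_s16_lo_to_s32(__m128i v)
{
    // lanes become v0, v1, v2, v3 sign-extended to 32 bit
    return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v), 16);
}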
- } - } - - return dx; - } - -private: - int scale_x, scale_y; - int cn; - bool fast_mode; - int step; - SIMDVecOp vecOp; -}; - -template -class resizeAreaFast_Invoker : - public ParallelLoopBody -{ -public: - resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, - int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : - ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), - scale_y(_scale_y), ofs(_ofs), xofs(_xofs) - { - } - - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int cn = src.channels(); - int area = scale_x*scale_y; - float scale = 1.f/(area); - int dwidth1 = (ssize.width/scale_x)*cn; - dsize.width *= cn; - ssize.width *= cn; - int dy, dx, k = 0; - - VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); - - for( dy = range.start; dy < range.end; dy++ ) - { - T* D = (T*)(dst.data + dst.step*dy); - int sy0 = dy*scale_y; - int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; - - if( sy0 >= ssize.height ) - { - for( dx = 0; dx < dsize.width; dx++ ) - D[dx] = 0; - continue; - } - - dx = vop(src.template ptr(sy0), D, w); - for( ; dx < w; dx++ ) - { - const T* S = src.template ptr(sy0) + xofs[dx]; - WT sum = 0; - k = 0; - #if CV_ENABLE_UNROLLED - for( ; k <= area - 4; k += 4 ) - sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; - #endif - for( ; k < area; k++ ) - sum += S[ofs[k]]; - - D[dx] = saturate_cast(sum * scale); - } - - for( ; dx < dsize.width; dx++ ) - { - WT sum = 0; - int count = 0, sx0 = xofs[dx]; - if( sx0 >= ssize.width ) - D[dx] = 0; - - for( int sy = 0; sy < scale_y; sy++ ) - { - if( sy0 + sy >= ssize.height ) - break; - const T* S = src.template ptr(sy0 + sy) + sx0; - for( int sx = 0; sx < scale_x*cn; sx += cn ) - { - if( sx0 + sx >= ssize.width ) - break; - sum += S[sx]; - count++; - } - } - - D[dx] = saturate_cast((float)sum/count); - } - } - } - -private: - Mat src; - Mat dst; - int scale_x, scale_y; - const int *ofs, *xofs; -}; - -template -static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, - int scale_x, int scale_y ) -{ - Range range(0, dst.rows); - resizeAreaFast_Invoker invoker(src, dst, scale_x, - scale_y, ofs, xofs); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); -} - -struct DecimateAlpha -{ - int si, di; - float alpha; -}; - - -template class ResizeArea_Invoker : - public ParallelLoopBody -{ -public: - ResizeArea_Invoker( const Mat& _src, Mat& _dst, - const DecimateAlpha* _xtab, int _xtab_size, - const DecimateAlpha* _ytab, int _ytab_size, - const int* _tabofs ) - { - src = &_src; - dst = &_dst; - xtab0 = _xtab; - xtab_size0 = _xtab_size; - ytab = _ytab; - ytab_size = _ytab_size; - tabofs = _tabofs; - } - - virtual void operator() (const Range& range) const - { - Size dsize = dst->size(); - int cn = dst->channels(); - dsize.width *= cn; - AutoBuffer _buffer(dsize.width*2); - const DecimateAlpha* xtab = xtab0; - int xtab_size = xtab_size0; - WT *buf = _buffer, *sum = buf + dsize.width; - int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; - - for( dx = 0; dx < dsize.width; dx++ ) - sum[dx] = (WT)0; - - for( j = j_start; j < j_end; j++ ) - { - WT beta = ytab[j].alpha; - int dy = ytab[j].di; - int sy = ytab[j].si; - - { - const T* S = src->template ptr(sy); - for( dx = 0; dx < dsize.width; dx++ ) - buf[dx] = (WT)0; - - if( cn == 1 ) - for( k = 0; k < xtab_size; k++ ) - { - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - buf[dxn] += S[xtab[k].si]*alpha; - } 
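// resizeAreaFast_Invoker above handles the ragged right/bottom edge
// explicitly: w is shortened (or zeroed) whenever a full scale_x x scale_y
// block would stick out of the source, and the trailing loop averages only
// the pixels that actually exist (hence the count variable).  A scalar,
// single-channel sketch of the whole integer-factor path, illustrative only:
#include <algorithm>

static void area_downscale_u8(const unsigned char* src, int sw, int sh, int sstep,
                              unsigned char* dst, int dw, int dh, int dstep,
                              int scale_x, int scale_y)
{
    for (int dy = 0; dy < dh; dy++)
        for (int dx = 0; dx < dw; dx++)
        {
            int sx0 = dx * scale_x, sy0 = dy * scale_y;
            int nx = std::min(scale_x, sw - sx0), ny = std::min(scale_y, sh - sy0);
            if (nx <= 0 || ny <= 0) { dst[dy * dstep + dx] = 0; continue; }
            int sum = 0;
            for (int y = 0; y < ny; y++)
                for (int x = 0; x < nx; x++)
                    sum += src[(sy0 + y) * sstep + sx0 + x];   // partial block at the edge
            dst[dy * dstep + dx] = (unsigned char)((sum + nx * ny / 2) / (nx * ny));
        }
}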
- else if( cn == 2 ) - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; - } - else if( cn == 3 ) - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - WT t2 = buf[dxn+2] + S[sxn+2]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; - } - else if( cn == 4 ) - { - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; - t0 = buf[dxn+2] + S[sxn+2]*alpha; - t1 = buf[dxn+3] + S[sxn+3]*alpha; - buf[dxn+2] = t0; buf[dxn+3] = t1; - } - } - else - { - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - for( int c = 0; c < cn; c++ ) - buf[dxn + c] += S[sxn + c]*alpha; - } - } - } - - if( dy != prev_dy ) - { - T* D = dst->template ptr(prev_dy); - - for( dx = 0; dx < dsize.width; dx++ ) - { - D[dx] = saturate_cast(sum[dx]); - sum[dx] = beta*buf[dx]; - } - prev_dy = dy; - } - else - { - for( dx = 0; dx < dsize.width; dx++ ) - sum[dx] += beta*buf[dx]; - } - } - - { - T* D = dst->template ptr(prev_dy); - for( dx = 0; dx < dsize.width; dx++ ) - D[dx] = saturate_cast(sum[dx]); - } - } - -private: - const Mat* src; - Mat* dst; - const DecimateAlpha* xtab0; - const DecimateAlpha* ytab; - int xtab_size0, ytab_size; - const int* tabofs; -}; - - -template -static void resizeArea_( const Mat& src, Mat& dst, - const DecimateAlpha* xtab, int xtab_size, - const DecimateAlpha* ytab, int ytab_size, - const int* tabofs ) -{ - parallel_for_(Range(0, dst.rows), - ResizeArea_Invoker(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), - dst.total()/((double)(1 << 16))); -} - - -typedef void (*ResizeFunc)( const Mat& src, Mat& dst, - const int* xofs, const void* alpha, - const int* yofs, const void* beta, - int xmin, int xmax, int ksize ); - -typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, - const int* ofs, const int *xofs, - int scale_x, int scale_y ); - -typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, - const DecimateAlpha* xtab, int xtab_size, - const DecimateAlpha* ytab, int ytab_size, - const int* yofs); - - -static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) -{ - int k = 0; - for(int dx = 0; dx < dsize; dx++ ) - { - double fsx1 = dx * scale; - double fsx2 = fsx1 + scale; - double cellWidth = std::min(scale, ssize - fsx1); - - int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); - - sx2 = std::min(sx2, ssize - 1); - sx1 = std::min(sx1, sx2); - - if( sx1 - fsx1 > 1e-3 ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = (sx1 - 1) * cn; - tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); - } - - for(int sx = sx1; sx < sx2; sx++ ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = sx * cn; - tab[k++].alpha = float(1.0 / cellWidth); - } - - if( fsx2 - sx2 > 1e-3 ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = sx2 * cn; - tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); - } - } - return k; -} - -#ifdef HAVE_OPENCL -static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, - float * const alpha_tab, int 
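// computeResizeAreaTab above gives each destination pixel a weight for every
// source pixel it covers, equal to the covered fraction divided by the total
// covered width, so the per-destination weights sum to 1.  Worked 1-D example
// for 5 -> 2 columns (scale 2.5): destination 0 covers source 0 and 1 fully
// and half of source 2, giving weights 0.4, 0.4, 0.2.  A small check built on
// the DecimateAlpha struct defined above (single channel assumed), useful
// when experimenting with the table code:
#include <cstdio>
#include <vector>

static void check_area_tab(const DecimateAlpha* tab, int tab_size, int dsize)
{
    std::vector<float> sums(dsize, 0.f);
    for (int k = 0; k < tab_size; k++)
        sums[tab[k].di] += tab[k].alpha;        // di is the destination index for cn == 1
    for (int dx = 0; dx < dsize; dx++)
        std::printf("dst %d: weight sum = %.6f\n", dx, sums[dx]);   // expect ~1.0
}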
* const ofs_tab) -{ - int k = 0, dx = 0; - for ( ; dx < dsize; dx++) - { - ofs_tab[dx] = k; - - double fsx1 = dx * scale; - double fsx2 = fsx1 + scale; - double cellWidth = std::min(scale, ssize - fsx1); - - int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); - - sx2 = std::min(sx2, ssize - 1); - sx1 = std::min(sx1, sx2); - - if (sx1 - fsx1 > 1e-3) - { - map_tab[k] = sx1 - 1; - alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); - } - - for (int sx = sx1; sx < sx2; sx++) - { - map_tab[k] = sx; - alpha_tab[k++] = float(1.0 / cellWidth); - } - - if (fsx2 - sx2 > 1e-3) - { - map_tab[k] = sx2; - alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); - } - } - ofs_tab[dx] = k; -} - -static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, - double fx, double fy, int interpolation) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; - float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; - int iscale_x = saturate_cast(inv_fx), iscale_y = saturate_cast(inv_fx); - bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && - std::abs(inv_fy - iscale_y) < DBL_EPSILON; - - // in case of scale_x && scale_y is equal to 2 - // INTER_AREA (fast) also is equal to INTER_LINEAR - if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) - /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower - - if( !(cn <= 4 && - (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || - (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) - return false; - - UMat src = _src.getUMat(); - _dst.create(dsize, type); - UMat dst = _dst.getUMat(); - - Size ssize = src.size(); - ocl::Kernel k; - size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows }; - - ocl::Image2D srcImage; - - // See if this could be done with a sampler. We stick with integer - // datatypes because the observed error is low. - bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && - ocl::Image2D::canCreateAlias(src) && depth <= 4 && - ocl::Image2D::isFormatSupported(depth, cn, true) && - src.offset==0); - if (useSampler) - { - int wdepth = std::max(depth, CV_32S); - char buf[2][32]; - cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " - "-D convertToDT=%s -D cn=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn); - k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); - - if (k.empty()) - useSampler = false; - else - { - // Convert the input into an OpenCL image type, using normalized channel data types - // and aliasing the UMat. 
- srcImage = ocl::Image2D(src, true, true); - k.args(srcImage, ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - } - - if (interpolation == INTER_LINEAR && !useSampler) - { - char buf[2][32]; - - // integer path is slower because of CPU part, so it's disabled - if (depth == CV_8U && ((void)0, 0)) - { - AutoBuffer _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); - int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; - short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; - float fxx, fyy; - int sx, sy; - - for (int dx = 0; dx < dsize.width; dx++) - { - fxx = (float)((dx+0.5)*inv_fx - 0.5); - sx = cvFloor(fxx); - fxx -= sx; - - if (sx < 0) - fxx = 0, sx = 0; - - if (sx >= ssize.width-1) - fxx = 0, sx = ssize.width-1; - - xofs[dx] = sx; - ialpha[dx*2 + 0] = saturate_cast((1.f - fxx) * INTER_RESIZE_COEF_SCALE); - ialpha[dx*2 + 1] = saturate_cast(fxx * INTER_RESIZE_COEF_SCALE); - } - - for (int dy = 0; dy < dsize.height; dy++) - { - fyy = (float)((dy+0.5)*inv_fy - 0.5); - sy = cvFloor(fyy); - fyy -= sy; - - yofs[dy] = sy; - ibeta[dy*2 + 0] = saturate_cast((1.f - fyy) * INTER_RESIZE_COEF_SCALE); - ibeta[dy*2 + 1] = saturate_cast(fyy * INTER_RESIZE_COEF_SCALE); - } - - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); - UMat coeffs; - Mat(1, static_cast(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); - - k.create("resizeLN", ocl::imgproc::resize_oclsrc, - format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " - "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " - "-D INTER_RESIZE_COEF_BITS=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, buf[0]), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn, INTER_RESIZE_COEF_BITS)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(coeffs)); - } - else - { - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); - k.create("resizeLN", ocl::imgproc::resize_oclsrc, - format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " - "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " - "-D INTER_RESIZE_COEF_BITS=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, buf[0]), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn, INTER_RESIZE_COEF_BITS)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - } - else if (interpolation == INTER_NEAREST) - { - k.create("resizeNN", ocl::imgproc::resize_oclsrc, - format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", - ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - else if (interpolation == INTER_AREA) - { - int wdepth = std::max(depth, is_area_fast ? 
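// The coordinate mapping used throughout this file, fxx = (dx + 0.5)*inv_fx - 0.5,
// aligns pixel *centres*: destination centre dx + 0.5 lands on source
// coordinate (dx + 0.5)*inv_fx, and subtracting 0.5 converts back from centre
// coordinates to the index of the left/top sample plus a fractional weight.
// Minimal helper showing the convention (no clamping, names illustrative):
#include <cmath>

static inline void map_pixel_center(int dx, double src_over_dst, int& sx, float& fx)
{
    float f = (float)((dx + 0.5) * src_over_dst - 0.5);
    sx = (int)std::floor(f);   // left/top neighbour
    fx = f - sx;               // weight of the right/bottom neighbour, in [0, 1)
}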
CV_32S : CV_32F); - int wtype = CV_MAKE_TYPE(wdepth, cn); - - char cvt[2][40]; - String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", - ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); - - UMat alphaOcl, tabofsOcl, mapOcl; - UMat dmap, smap; - - if (is_area_fast) - { - int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); - buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" - " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", - ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), - ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), - iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); - - k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); - if (k.empty()) - return false; - } - else - { - buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); - k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); - if (k.empty()) - return false; - - int xytab_size = (ssize.width + ssize.height) << 1; - int tabofs_size = dsize.height + dsize.width + 2; - - AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); - AutoBuffer _xyalpha_tab(xytab_size); - int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); - float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); - int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; - - ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); - ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); - - // loading precomputed arrays to GPU - Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); - Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); - Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); - } - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); - - if (is_area_fast) - k.args(srcarg, dstarg); - else - k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), - ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); - - return k.run(2, globalsize, NULL, false); - } - - return k.run(2, globalsize, 0, false); -} - -#endif - -#ifdef HAVE_IPP -#define IPP_RESIZE_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_resizeParallel: public ParallelLoopBody -{ -public: - ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): - m_src(src), m_dst(dst), m_ok(ok) {} - ~ipp_resizeParallel() - { - } - - void Init(IppiInterpolationType inter) - { - iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl); - - m_ok = true; - } - - virtual void operator() (const Range& range) const - { - CV_INSTRUMENT_REGION_IPP() - - if(!m_ok) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile); - } - catch(::ipp::IwException) - { - m_ok = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - mutable ::ipp::IwiResize iwiResize; - - volatile bool &m_ok; - const ipp_resizeParallel& operator= (const ipp_resizeParallel&); -}; - -class ipp_resizeAffineParallel: public 
ParallelLoopBody -{ -public: - ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): - m_src(src), m_dst(dst), m_ok(ok) {} - ~ipp_resizeAffineParallel() - { - } - - void Init(IppiInterpolationType inter, double scaleX, double scaleY) - { - double shift = (inter == ippNearest)?-1e-10:-0.5; - double coeffs[2][3] = { - {scaleX, 0, shift+0.5*scaleX}, - {0, scaleY, shift+0.5*scaleY} - }; - - iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl); - - m_ok = true; - } - - virtual void operator() (const Range& range) const - { - CV_INSTRUMENT_REGION_IPP() - - if(!m_ok) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile); - } - catch(::ipp::IwException) - { - m_ok = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - mutable ::ipp::IwiWarpAffine iwiWarpAffine; - - volatile bool &m_ok; - const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&); -}; -#endif - -static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height, - uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, - int depth, int channels, int interpolation) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP() - - IppDataType ippDataType = ippiGetDataType(depth); - IppiInterpolationType ippInter = ippiGetInterpolation(interpolation); - if(ippInter < 0) - return false; - - // Resize which doesn't match OpenCV exactly - if(!cv::ipp::useIPP_NE()) - { - if(ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear)) - return false; - } - - if(ippInter != ippLinear && ippDataType == ipp64f) - return false; - -#if IPP_VERSION_X100 < 201801 - // Degradations on int^2 linear downscale - if(ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale - { - int scale_x = (int)(1/inv_scale_x); - int scale_y = (int)(1/inv_scale_y); - if(1/inv_scale_x - scale_x < DBL_EPSILON && 1/inv_scale_y - scale_y < DBL_EPSILON) // if integer - { - if(!(scale_x&(scale_x-1)) && !(scale_y&(scale_y-1))) // if power of 2 - return false; - } - } -#endif - - bool affine = false; - const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10; - double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x; - double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y; - - // Use affine transform resize to allow sub-pixel accuracy - if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS) - affine = true; - - // Affine doesn't support Lanczos and Super interpolations - if(affine && (ippInter == ippLanczos || ippInter == ippSuper)) - return false; - - try - { - ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step); - ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step); - - bool ok; - int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height))); - Range range(0, dst_height); - ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok); - ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok); - ParallelLoopBody *pInvoker = NULL; - if(affine) - { - pInvoker = &invokerAffine; - invokerAffine.Init(ippInter, 
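// The affine fallback below reproduces a resize under the same pixel-centre
// convention: requiring x_dst + 0.5 == scale*(x_src + 0.5) gives
// x_dst = scale*x_src + (0.5*scale - 0.5), i.e. the translation term is
// shift + 0.5*scale with shift = -0.5; the -1e-10 used for nearest-neighbour
// presumably only biases exact .5 ties.  The derivation as a helper,
// illustrative only:
static inline double resize_affine_offset(double scale, bool nearest)
{
    double shift = nearest ? -1e-10 : -0.5;
    return shift + 0.5 * scale;
}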
inv_scale_x, inv_scale_y); - } - else - { - pInvoker = &invokerGeneral; - invokerGeneral.Init(ippInter); - } - - if(IPP_RESIZE_PARALLEL && threads > 1) - parallel_for_(range, *pInvoker, threads*4); - else - pInvoker->operator()(range); - - if(!ok) - return false; - } - catch(::ipp::IwException) - { - return false; - } - return true; -#else - CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step); - CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth); - CV_UNUSED(channels); CV_UNUSED(interpolation); - return false; -#endif -} +const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS; + +static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; + +static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; +static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; + +#if CV_SSE2 || CV_NEON +static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; +static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); #endif -//================================================================================================== +static float BicubicTab_f[INTER_TAB_SIZE2][4][4]; +static short BicubicTab_i[INTER_TAB_SIZE2][4][4]; -namespace hal { +static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8]; +static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8]; -void resize(int src_type, - const uchar * src_data, size_t src_step, int src_width, int src_height, - uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - double inv_scale_x, double inv_scale_y, int interpolation) +static inline void interpolateLinear( float x, float* coeffs ) { - CV_INSTRUMENT_REGION() - - CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0)); - if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON) - { - inv_scale_x = static_cast(dst_width) / src_width; - inv_scale_y = static_cast(dst_height) / src_height; - } + coeffs[0] = 1.f - x; + coeffs[1] = x; +} - CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation); +static inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; - int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type); - Size dsize = Size(saturate_cast(src_width*inv_scale_x), - saturate_cast(src_height*inv_scale_y)); - CV_Assert( dsize.area() > 0 ); + coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; + coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; + coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} - CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation)) +static inline void interpolateLanczos4( float x, float* coeffs ) +{ + static const double s45 = 0.70710678118654752440084436210485; + static const double cs[][2]= + {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; - static ResizeFunc linear_tab[] = + if( x < FLT_EPSILON ) { - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32s8u> >, - 0, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f16u> >, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f16s> >, - 0, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f> >, - resizeGeneric_< - 
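// interpolateCubic above is the standard cubic convolution kernel with
// A = -0.75, and interpolateLanczos4 evaluates the Lanczos-4 window
// L(t) = sinc(t)*sinc(t/4) at the eight taps t = x + 3 - i, using a
// precomputed angle table so only one sin/cos pair is needed per call.
// A direct (slower) reference that should agree after the final
// normalisation, illustrative only:
#include <cmath>

static void lanczos4_weights_ref(float x, float* w)   // x in [0, 1)
{
    const double pi = 3.14159265358979323846;
    double sum = 0;
    for (int i = 0; i < 8; i++)
    {
        double t = x + 3 - i;                      // distance from the sample point
        double v = std::fabs(t) < 1e-7 ? 1.0
                 : std::sin(pi * t) * std::sin(pi * t / 4) / (pi * t * pi * t / 4);
        w[i] = (float)v;
        sum += v;
    }
    for (int i = 0; i < 8; i++)
        w[i] = (float)(w[i] / sum);                // weights sum to 1
}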
HResizeLinear, - VResizeLinear, - VResizeNoVec> >, - 0 - }; + for( int i = 0; i < 8; i++ ) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } - static ResizeFunc cubic_tab[] = + float sum = 0; + double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); + for(int i = 0; i < 8; i++ ) { - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32s8u> >, - 0, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f16u> >, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f16s> >, - 0, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f> >, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeNoVec> >, - 0 - }; + double y = -(x+3-i)*CV_PI*0.25; + coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); + sum += coeffs[i]; + } - static ResizeFunc lanczos4_tab[] = - { - resizeGeneric_, - VResizeLanczos4, - VResizeNoVec> >, - 0, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f16u> >, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f16s> >, - 0, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f> >, - resizeGeneric_, - VResizeLanczos4, - VResizeNoVec> >, - 0 - }; + sum = 1.f/sum; + for(int i = 0; i < 8; i++ ) + coeffs[i] *= sum; +} - static ResizeAreaFastFunc areafast_tab[] = +static void initInterTab1D(int method, float* tab, int tabsz) +{ + float scale = 1.f/tabsz; + if( method == INTER_LINEAR ) { - resizeAreaFast_ >, - 0, - resizeAreaFast_ >, - resizeAreaFast_ >, - 0, - resizeAreaFast_, - resizeAreaFast_ >, - 0 - }; - - static ResizeAreaFunc area_tab[] = + for( int i = 0; i < tabsz; i++, tab += 2 ) + interpolateLinear( i*scale, tab ); + } + else if( method == INTER_CUBIC ) { - resizeArea_, 0, resizeArea_, - resizeArea_, 0, resizeArea_, - resizeArea_, 0 - }; - - double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; - - int iscale_x = saturate_cast(scale_x); - int iscale_y = saturate_cast(scale_y); - - bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && - std::abs(scale_y - iscale_y) < DBL_EPSILON; - - Mat src(Size(src_width, src_height), src_type, const_cast(src_data), src_step); - Mat dst(dsize, src_type, dst_data, dst_step); - - if( interpolation == INTER_NEAREST ) + for( int i = 0; i < tabsz; i++, tab += 4 ) + interpolateCubic( i*scale, tab ); + } + else if( method == INTER_LANCZOS4 ) { - resizeNN( src, dst, inv_scale_x, inv_scale_y ); - return; + for( int i = 0; i < tabsz; i++, tab += 8 ) + interpolateLanczos4( i*scale, tab ); } + else + CV_Error( CV_StsBadArg, "Unknown interpolation method" ); +} - int k, sx, sy, dx, dy; +static const void* initInterTab2D( int method, bool fixpt ) +{ + static bool inittab[INTER_MAX+1] = {false}; + float* tab = 0; + short* itab = 0; + int ksize = 0; + if( method == INTER_LINEAR ) + tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; + else if( method == INTER_CUBIC ) + tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; + else if( method == INTER_LANCZOS4 ) + tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; + else + CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" ); + if( !inittab[method] ) { - // in case of scale_x && scale_y is equal to 2 - // INTER_AREA (fast) also is equal to INTER_LINEAR - if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) - interpolation = INTER_AREA; - - // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). 
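// initInterTab2D (continued below) builds every 2-D weight table as the outer
// product of two 1-D kernels: weight(k1, k2) = ky[k1]*kx[k2], evaluated for
// each (i, j) pair of fractional offsets on the INTER_TAB_SIZE grid.  The core
// of that loop, isolated for one offset pair:
static void make_tab2d(const float* kx, const float* ky, int ksize, float* tab)
{
    for (int k1 = 0; k1 < ksize; k1++)
        for (int k2 = 0; k2 < ksize; k2++)
            tab[k1 * ksize + k2] = ky[k1] * kx[k2];
}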
- // In other cases it is emulated using some variant of bilinear interpolation - if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) - { - if( is_area_fast ) + AutoBuffer _tab(8*INTER_TAB_SIZE); + int i, j, k1, k2; + initInterTab1D(method, _tab, INTER_TAB_SIZE); + for( i = 0; i < INTER_TAB_SIZE; i++ ) + for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) { - int area = iscale_x*iscale_y; - size_t srcstep = src_step / src.elemSize1(); - AutoBuffer _ofs(area + dsize.width*cn); - int* ofs = _ofs; - int* xofs = ofs + area; - ResizeAreaFastFunc func = areafast_tab[depth]; - CV_Assert( func != 0 ); - - for( sy = 0, k = 0; sy < iscale_y; sy++ ) - for( sx = 0; sx < iscale_x; sx++ ) - ofs[k++] = (int)(sy*srcstep + sx*cn); - - for( dx = 0; dx < dsize.width; dx++ ) + int isum = 0; + NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; + NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; + + for( k1 = 0; k1 < ksize; k1++ ) { - int j = dx * cn; - sx = iscale_x * j; - for( k = 0; k < cn; k++ ) - xofs[j + k] = sx + k; + float vy = _tab[i*ksize + k1]; + for( k2 = 0; k2 < ksize; k2++ ) + { + float v = vy*_tab[j*ksize + k2]; + tab[k1*ksize + k2] = v; + isum += itab[k1*ksize + k2] = saturate_cast(v*INTER_REMAP_COEF_SCALE); + } } - func( src, dst, ofs, xofs, iscale_x, iscale_y ); - return; - } - - ResizeAreaFunc func = area_tab[depth]; - CV_Assert( func != 0 && cn <= 4 ); - - AutoBuffer _xytab((src_width + src_height)*2); - DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2; - - int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab); - int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab); - - AutoBuffer _tabofs(dsize.height + 1); - int* tabofs = _tabofs; - for( k = 0, dy = 0; k < ytab_size; k++ ) - { - if( k == 0 || ytab[k].di != ytab[k-1].di ) + if( isum != INTER_REMAP_COEF_SCALE ) { - assert( ytab[k].di == dy ); - tabofs[dy++] = k; + int diff = isum - INTER_REMAP_COEF_SCALE; + int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; + for( k1 = ksize2; k1 < ksize2+2; k1++ ) + for( k2 = ksize2; k2 < ksize2+2; k2++ ) + { + if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + if( diff < 0 ) + itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); + else + itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); } } - tabofs[dy] = ytab_size; - - func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); - return; - } - } - - int xmin = 0, xmax = dsize.width, width = dsize.width*cn; - bool area_mode = interpolation == INTER_AREA; - bool fixpt = depth == CV_8U; - float fx, fy; - ResizeFunc func=0; - int ksize=0, ksize2; - if( interpolation == INTER_CUBIC ) - ksize = 4, func = cubic_tab[depth]; - else if( interpolation == INTER_LANCZOS4 ) - ksize = 8, func = lanczos4_tab[depth]; - else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) - ksize = 2, func = linear_tab[depth]; - else - CV_Error( CV_StsBadArg, "Unknown interpolation method" ); - ksize2 = ksize/2; - - CV_Assert( func != 0 ); - - AutoBuffer _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); - int* xofs = (int*)(uchar*)_buffer; - int* yofs = xofs + width; - float* alpha = (float*)(yofs + dsize.height); - short* ialpha = (short*)alpha; - float* beta = alpha + width*ksize; - short* ibeta = ialpha + width*ksize; - float cbuf[MAX_ESIZE]; - - for( dx = 0; dx < dsize.width; dx++ ) - { - if( 
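// Rounding each fixed-point weight separately can make their sum differ
// slightly from the exact scale; the isum correction above pushes that
// difference into one of the four central taps so flat image areas stay
// exactly flat after remapping.  Minimal illustration (assuming the usual
// 1 << 15 remap coefficient scale; the real code picks the largest or
// smallest central tap instead of a fixed one):
static void fix_weight_sum(short* w, int n, int scale = 1 << 15)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += w[i];
    w[n / 2] = (short)(w[n / 2] - (sum - scale));   // absorb the rounding drift
}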
!area_mode ) - { - fx = (float)((dx+0.5)*scale_x - 0.5); - sx = cvFloor(fx); - fx -= sx; - } - else - { - sx = cvFloor(dx*scale_x); - fx = (float)((dx+1) - (sx+1)*inv_scale_x); - fx = fx <= 0 ? 0.f : fx - cvFloor(fx); - } - - if( sx < ksize2-1 ) - { - xmin = dx+1; - if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) - fx = 0, sx = 0; - } - - if( sx + ksize2 >= src_width ) - { - xmax = std::min( xmax, dx ); - if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) - fx = 0, sx = src_width-1; - } - - for( k = 0, sx *= cn; k < cn; k++ ) - xofs[dx*cn + k] = sx + k; - - if( interpolation == INTER_CUBIC ) - interpolateCubic( fx, cbuf ); - else if( interpolation == INTER_LANCZOS4 ) - interpolateLanczos4( fx, cbuf ); - else - { - cbuf[0] = 1.f - fx; - cbuf[1] = fx; - } - if( fixpt ) - { - for( k = 0; k < ksize; k++ ) - ialpha[dx*cn*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); - for( ; k < cn*ksize; k++ ) - ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; - } - else - { - for( k = 0; k < ksize; k++ ) - alpha[dx*cn*ksize + k] = cbuf[k]; - for( ; k < cn*ksize; k++ ) - alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; - } - } - - for( dy = 0; dy < dsize.height; dy++ ) - { - if( !area_mode ) - { - fy = (float)((dy+0.5)*scale_y - 0.5); - sy = cvFloor(fy); - fy -= sy; - } - else - { - sy = cvFloor(dy*scale_y); - fy = (float)((dy+1) - (sy+1)*inv_scale_y); - fy = fy <= 0 ? 0.f : fy - cvFloor(fy); - } - - yofs[dy] = sy; - if( interpolation == INTER_CUBIC ) - interpolateCubic( fy, cbuf ); - else if( interpolation == INTER_LANCZOS4 ) - interpolateLanczos4( fy, cbuf ); - else - { - cbuf[0] = 1.f - fy; - cbuf[1] = fy; - } - - if( fixpt ) - { - for( k = 0; k < ksize; k++ ) - ibeta[dy*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); - } - else + tab -= INTER_TAB_SIZE2*ksize*ksize; + itab -= INTER_TAB_SIZE2*ksize*ksize; +#if CV_SSE2 || CV_NEON + if( method == INTER_LINEAR ) { - for( k = 0; k < ksize; k++ ) - beta[dy*ksize + k] = cbuf[k]; + for( i = 0; i < INTER_TAB_SIZE2; i++ ) + for( j = 0; j < 4; j++ ) + { + BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; + BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; + BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; + BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; + } } +#endif + inittab[method] = true; } - - func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, - fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize ); + return fixpt ? 
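// For 8-bit images the resize coefficients are stored as 16-bit fixed point
// (value * INTER_RESIZE_COEF_SCALE, assumed here to be 1 << 11), which turns
// each interpolation step into an integer multiply-accumulate.  A
// one-dimensional sketch of the quantisation and the rounding shift; the real
// code keeps 2*11 fractional bits through the combined horizontal + vertical
// pass and uses saturate_cast for the final store:
static inline unsigned char lerp_u8_fixpt(unsigned char a, unsigned char b, float fx)
{
    const int BITS = 11, SCALE = 1 << BITS;          // assumed coefficient scale
    short ia = (short)((1.f - fx) * SCALE + 0.5f);
    short ib = (short)(fx * SCALE + 0.5f);
    int acc = a * ia + b * ib;                       // 8-bit value with 11 fractional bits
    return (unsigned char)((acc + (SCALE >> 1)) >> BITS);
}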
(const void*)itab : (const void*)tab; } -} // cv::hal:: -} // cv:: - -//================================================================================================== - -void cv::resize( InputArray _src, OutputArray _dst, Size dsize, - double inv_scale_x, double inv_scale_y, int interpolation ) +#ifndef __MINGW32__ +static bool initAllInterTab2D() { - CV_INSTRUMENT_REGION() + return initInterTab2D( INTER_LINEAR, false ) && + initInterTab2D( INTER_LINEAR, true ) && + initInterTab2D( INTER_CUBIC, false ) && + initInterTab2D( INTER_CUBIC, true ) && + initInterTab2D( INTER_LANCZOS4, false ) && + initInterTab2D( INTER_LANCZOS4, true ); +} - Size ssize = _src.size(); +static volatile bool doInitAllInterTab2D = initAllInterTab2D(); +#endif - CV_Assert( ssize.width > 0 && ssize.height > 0 ); - CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); - if( dsize.area() == 0 ) - { - dsize = Size(saturate_cast(ssize.width*inv_scale_x), - saturate_cast(ssize.height*inv_scale_y)); - CV_Assert( dsize.area() > 0 ); - } - else - { - inv_scale_x = (double)dsize.width/ssize.width; - inv_scale_y = (double)dsize.height/ssize.height; - } +template struct Cast +{ + typedef ST type1; + typedef DT rtype; - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, - ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) + DT operator()(ST val) const { return saturate_cast
(val); } +}; - Mat src = _src.getMat(); - _dst.create(dsize, src.type()); - Mat dst = _dst.getMat(); +template struct FixedPtCast +{ + typedef ST type1; + typedef DT rtype; + enum { SHIFT = bits, DELTA = 1 << (bits-1) }; - if (dsize == ssize) - { - // Source and destination are of same size. Use simple copy. - src.copyTo(dst); - return; - } + DT operator()(ST val) const { return saturate_cast
((val + DELTA)>>SHIFT); } +}; - hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation); +static inline int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b-1) : a; } - /****************************************************************************************\ * General warping (affine, perspective, remap) * \****************************************************************************************/ -namespace cv -{ - template static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy, int borderType, const Scalar& _borderValue ) @@ -6434,16 +3234,6 @@ cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst) return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data); } -CV_IMPL void -cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); - CV_Assert( src.type() == dst.type() ); - cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, - (double)dst.rows/src.rows, method ); -} - - CV_IMPL void cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, int flags, CvScalar fillval ) diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp index ed1146dcd3..ef1f6646a2 100644 --- a/modules/imgproc/src/imgwarp.hpp +++ b/modules/imgproc/src/imgwarp.hpp @@ -56,8 +56,6 @@ namespace cv namespace opt_AVX2 { #if CV_TRY_AVX2 -void resizeNN2_AVX2(const Range&, const Mat&, Mat&, int*, int, double); -void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); #endif } @@ -65,10 +63,6 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X namespace opt_SSE4_1 { #if CV_TRY_SSE4_1 -void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); -void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); - -int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width); void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width); void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width); void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width); diff --git a/modules/imgproc/src/imgwarp.sse4_1.cpp b/modules/imgproc/src/imgwarp.sse4_1.cpp index 49954b1dbc..c25967fcc9 100644 --- a/modules/imgproc/src/imgwarp.sse4_1.cpp +++ b/modules/imgproc/src/imgwarp.sse4_1.cpp @@ -55,179 +55,6 @@ namespace cv namespace opt_SSE4_1 { -class resizeNNInvokerSSE2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x7); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 8) - { 
- ushort imm = *(ushort*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi16(pixels, imm, 0); - imm = *(ushort*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi16(pixels, imm, 1); - imm = *(ushort*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi16(pixels, imm, 2); - imm = *(ushort*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi16(pixels, imm, 3); - imm = *(ushort*)(S + x_ofs[x + 4]); - pixels = _mm_insert_epi16(pixels, imm, 4); - imm = *(ushort*)(S + x_ofs[x + 5]); - pixels = _mm_insert_epi16(pixels, imm, 5); - imm = *(ushort*)(S + x_ofs[x + 6]); - pixels = _mm_insert_epi16(pixels, imm, 6); - imm = *(ushort*)(S + x_ofs[x + 7]); - pixels = _mm_insert_epi16(pixels, imm, 7); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); - resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); -}; - -class resizeNNInvokerSSE4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x3); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 4) - { - int imm = *(int*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi32(pixels, imm, 0); - imm = *(int*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi32(pixels, imm, 1); - imm = *(int*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi32(pixels, imm, 2); - imm = *(int*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi32(pixels, imm, 3); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); - resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); -}; - -void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = 
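// All of the nearest-neighbour invokers (SSE2/SSE4/AVX2 and the scalar tails)
// gather pixels through the same precomputed x_ofs[] table, which stores the
// byte offset of the chosen source pixel for every destination column, so a
// destination row is produced by pure gathering.  Scalar reference, together
// with the table construction the callers perform once per image (names
// illustrative):
#include <algorithm>
#include <cmath>
#include <cstring>

static void build_x_ofs(int* x_ofs, int dwidth, int swidth, double ifx, int pix_size)
{
    for (int x = 0; x < dwidth; x++)
        x_ofs[x] = std::min((int)std::floor(x * ifx), swidth - 1) * pix_size;
}

static void resize_nn_row(const unsigned char* S, unsigned char* D,
                          const int* x_ofs, int dwidth, int pix_size)
{
    for (int x = 0; x < dwidth; x++)
        std::memcpy(D + x * pix_size, S + x_ofs[x], pix_size);   // copy one pixel
}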
_mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for (; x <= width - 8; x += 8) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); - } - - return x; -} - void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width) { int x = 0; diff --git a/modules/imgproc/src/resize.avx2.cpp b/modules/imgproc/src/resize.avx2.cpp new file mode 100644 index 0000000000..0d934da169 --- /dev/null +++ b/modules/imgproc/src/resize.avx2.cpp @@ -0,0 +1,261 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "resize.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ + +class resizeNNInvokerAVX4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0x7); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_maskstore_epi32((int*)D, mask, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_storeu_si256((__m256i*)D, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + _mm256_zeroupper(); + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); + resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); +}; + +class resizeNNInvokerAVX2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + //int avxWidth = (width - 1) - ((width - 1) & 
0x7); + int avxWidth = width - (width & 0xf); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, + 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); + const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, + // 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_maskstore_epi32((int*)D, mask, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_storeu_si256((__m256i*)D, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + _mm256_zeroupper(); + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); + resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); +}; + +void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, 
int pix_size4, double ify) +{ + resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +} +} +/* End of file. */ diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp new file mode 100644 index 0000000000..13dbf3b795 --- /dev/null +++ b/modules/imgproc/src/resize.cpp @@ -0,0 +1,3330 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2017, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. 
+// +// */ + +#include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" +#include "hal_replacement.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" +#include "resize.hpp" + +using namespace cv; + +namespace cv +{ + +/************** interpolation formulas and tables ***************/ + +const int INTER_RESIZE_COEF_BITS=11; +const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; + +static inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; + + coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; + coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; + coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static inline void interpolateLanczos4( float x, float* coeffs ) +{ + static const double s45 = 0.70710678118654752440084436210485; + static const double cs[][2]= + {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; + + if( x < FLT_EPSILON ) + { + for( int i = 0; i < 8; i++ ) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } + + float sum = 0; + double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); + for(int i = 0; i < 8; i++ ) + { + double y = -(x+3-i)*CV_PI*0.25; + coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); + sum += coeffs[i]; + } + + sum = 1.f/sum; + for(int i = 0; i < 8; i++ ) + coeffs[i] *= sum; +} + +template<typename ST, typename DT> struct Cast +{ + typedef ST type1; + typedef DT rtype; + + DT operator()(ST val) const { return saturate_cast<DT>
(val); } +}; + +template<typename ST, typename DT, int bits> struct FixedPtCast +{ + typedef ST type1; + typedef DT rtype; + enum { SHIFT = bits, DELTA = 1 << (bits-1) }; + + DT operator()(ST val) const { return saturate_cast<DT>
((val + DELTA)>>SHIFT); } +}; + +/****************************************************************************************\ +* Resize * +\****************************************************************************************/ + +class resizeNNInvoker : + public ParallelLoopBody +{ +public: + resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x, pix_size = (int)src.elemSize(); + + for( y = range.start; y < range.end; y++ ) + { + uchar* D = dst.data + dst.step*y; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.ptr(sy); + + switch( pix_size ) + { + case 1: + for( x = 0; x <= dsize.width - 2; x += 2 ) + { + uchar t0 = S[x_ofs[x]]; + uchar t1 = S[x_ofs[x+1]]; + D[x] = t0; + D[x+1] = t1; + } + + for( ; x < dsize.width; x++ ) + D[x] = S[x_ofs[x]]; + break; + case 2: + for( x = 0; x < dsize.width; x++ ) + *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); + break; + case 3: + for( x = 0; x < dsize.width; x++, D += 3 ) + { + const uchar* _tS = S + x_ofs[x]; + D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; + } + break; + case 4: + for( x = 0; x < dsize.width; x++ ) + *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); + break; + case 6: + for( x = 0; x < dsize.width; x++, D += 6 ) + { + const ushort* _tS = (const ushort*)(S + x_ofs[x]); + ushort* _tD = (ushort*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; + } + break; + case 8: + for( x = 0; x < dsize.width; x++, D += 8 ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; + } + break; + case 12: + for( x = 0; x < dsize.width; x++, D += 12 ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; + } + break; + default: + for( x = 0; x < dsize.width; x++, D += pix_size ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + for( int k = 0; k < pix_size4; k++ ) + _tD[k] = _tS[k]; + } + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvoker(const resizeNNInvoker&); + resizeNNInvoker& operator=(const resizeNNInvoker&); +}; + +static void +resizeNN( const Mat& src, Mat& dst, double fx, double fy ) +{ + Size ssize = src.size(), dsize = dst.size(); + AutoBuffer _x_ofs(dsize.width); + int* x_ofs = _x_ofs; + int pix_size = (int)src.elemSize(); + int pix_size4 = (int)(pix_size / sizeof(int)); + double ifx = 1./fx, ify = 1./fy; + int x; + + for( x = 0; x < dsize.width; x++ ) + { + int sx = cvFloor(x*ifx); + x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; + } + + Range range(0, dsize.height); +#if CV_TRY_AVX2 + if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify); + else + opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify); + } + else +#endif +#if CV_TRY_SSE4_1 + if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + else + opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + } + else +#endif + { + resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); + } +} + + +struct 
VResizeNoVec +{ + int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } +}; + +struct HResizeNoVec +{ + int operator()(const uchar**, uchar**, int, const int*, + const uchar*, int, int, int, int, int) const { return 0; } +}; + +#if CV_SSE2 + +struct VResizeLinearVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1]; + int x = 0; + __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); + __m128i delta = _mm_set1_epi16(2); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); + + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); + + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } + + for( ; x < width - 4; x += 4 ) + { + __m128i x0, y0; + x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); + y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); + x0 = _mm_packs_epi32(x0, x0); + y0 = _mm_packs_epi32(y0, y0); + x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x0 = _mm_packus_epi16(x0, x0); + *(int*)(dst + x) = _mm_cvtsi128_si32(x0); + } + + return x; + } +}; + + +template struct VResizeLinearVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, 
const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_load_ps(S0 + x + 8); + x1 = _mm_load_ps(S0 + x + 12); + y0 = _mm_load_ps(S1 + x + 8); + y1 = _mm_load_ps(S1 + x + 12); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + _mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_loadu_ps(S0 + x + 8); + x1 = _mm_loadu_ps(S0 + x + 12); + y0 = _mm_loadu_ps(S1 + x + 8); + y1 = _mm_loadu_ps(S1 + x + 12); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + _mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); + } + + for( ; x < width - 4; x += 4 ) + { + __m128 x0, y0; + __m128i t0; + x0 = _mm_loadu_ps(S0 + x); + y0 = _mm_loadu_ps(S1 + x); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); + _mm_storel_epi64( (__m128i*)(dst + x), t0); + } + + return x; + } +}; + +typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; +typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_load_ps(S0 + 
x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + + return x; + } +}; + + +struct VResizeCubicVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), + b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_load_si128((const __m128i*)(S2 + x)); + x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S3 + x)); + y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = 
_mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + + return x; + } +}; + + +template struct VResizeCubicVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + __m128i t0, t1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + x0 = _mm_loadu_ps(S2 + x); + x1 = _mm_loadu_ps(S2 + x + 4); + y0 = _mm_loadu_ps(S3 + x); + y1 = _mm_loadu_ps(S3 + x + 4); + + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); + t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); + + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); + _mm_storeu_si128( (__m128i*)(dst + x), t0); + } + + return x; + } +}; + +typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; +typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + x0 = _mm_loadu_ps(S2 + x); + x1 = _mm_loadu_ps(S2 + x + 4); + y0 = _mm_loadu_ps(S3 + x); + y1 = _mm_loadu_ps(S3 + x + 4); + + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + _mm_storeu_ps( dst + x, s0); + _mm_storeu_ps( dst + x + 4, s1); + } + + return x; + } +}; + +#if CV_TRY_SSE4_1 + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; + } +}; + +#else + +typedef VResizeNoVec VResizeLanczos4Vec_32f16u; + +#endif + +struct 
VResizeLanczos4Vec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); + + __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); + __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); + } + + return x; + } +}; + + +struct VResizeLanczos4Vec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + float* dst = (float*)_dst; + int x = 0; + + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for( ; x <= width - 4; x += 4 ) + { + __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + _mm_storeu_ps(dst + x, v_dst); + } + + return x; + } +}; + + +#elif CV_NEON + +struct VResizeLinearVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; + const short* beta = (const short*)_beta; + int x = 0; + 
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); + + for( ; x <= width - 16; x += 16) + { + int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); + int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); + + int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); + int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); + + int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), + vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); + v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); + + v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); + v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); + v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); + v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); + + v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); + v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); + + int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), + vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); + v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); + + vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + short* dst = (short*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), 
v_src11 = vld1q_f32(S1 + x + 4); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); + } + + return x; + } +}; + +typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + short* dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + ushort * dst = (ushort*)_dst; + int x = 0; + float32x4_t 
v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); + + v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), + v_b5, vld1q_f32(S5 + x + 4)), + v_b6, vld1q_f32(S6 + x + 4)), + v_b7, vld1q_f32(S7 + x + 4)); + v_dst1 = vaddq_f32(v_dst0, v_dst1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); + + v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), + v_b5, vld1q_f32(S5 + x + 4)), + v_b6, vld1q_f32(S6 + x + 4)), + v_b7, vld1q_f32(S7 + x + 4)); + v_dst1 = vaddq_f32(v_dst0, v_dst1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 4; x += 4 ) + { + float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); + } + + return x; + } +}; + +#else + +typedef VResizeNoVec VResizeLinearVec_32s8u; +typedef VResizeNoVec VResizeLinearVec_32f16u; +typedef VResizeNoVec VResizeLinearVec_32f16s; +typedef VResizeNoVec VResizeLinearVec_32f; + +typedef VResizeNoVec VResizeCubicVec_32s8u; +typedef VResizeNoVec VResizeCubicVec_32f16u; +typedef VResizeNoVec VResizeCubicVec_32f16s; +typedef VResizeNoVec VResizeCubicVec_32f; + +typedef VResizeNoVec VResizeLanczos4Vec_32f16u; +typedef VResizeNoVec VResizeLanczos4Vec_32f16s; +typedef VResizeNoVec VResizeLanczos4Vec_32f; + +#endif + +typedef HResizeNoVec HResizeLinearVec_8u32s; +typedef HResizeNoVec HResizeLinearVec_16u32f; +typedef HResizeNoVec HResizeLinearVec_16s32f; +typedef HResizeNoVec HResizeLinearVec_32f; +typedef HResizeNoVec HResizeLinearVec_64f; + + +template +struct HResizeLinear +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + int dx, k; + VecOp vecOp; + + int dx0 = vecOp((const uchar**)src, (uchar**)dst, count, + xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); + + for( k = 0; k <= count - 2; k++ ) + { + const T *S0 = src[k], *S1 = src[k+1]; + WT *D0 = dst[k], *D1 = dst[k+1]; + for( dx = dx0; dx < xmax; dx++ ) + { + int sx = xofs[dx]; + WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; + WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; + WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; + D0[dx] = t0; D1[dx] = t1; + } + + for( ; dx < dwidth; dx++ ) + { + int sx = xofs[dx]; + D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); + } + } + + for( ; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + for( dx = 0; dx < xmax; dx++ ) + { + int sx = xofs[dx]; + D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; + } + + for( ; dx < dwidth; dx++ ) + D[dx] = WT(S[xofs[dx]]*ONE); + } + } +}; + + +template +struct VResizeLinear +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + WT b0 = beta[0], b1 = beta[1]; + const WT *S0 = src[0], *S1 = src[1]; + CastOp castOp; + VecOp vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + WT t0, t1; + t0 = S0[x]*b0 + S1[x]*b1; + t1 = S0[x+1]*b0 + S1[x+1]*b1; + dst[x] = castOp(t0); dst[x+1] = castOp(t1); + t0 = S0[x+2]*b0 + S1[x+2]*b1; + t1 = S0[x+3]*b0 + S1[x+3]*b1; + dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); + } + #endif + for( ; x < width; x++ ) + dst[x] = castOp(S0[x]*b0 + S1[x]*b1); + } +}; + +template<> +struct VResizeLinear, VResizeLinearVec_32s8u> +{ + typedef uchar value_type; + typedef int buf_type; + typedef short alpha_type; + + void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const + { + alpha_type b0 = beta[0], b1 = beta[1]; + const buf_type *S0 = src[0], *S1 = src[1]; + VResizeLinearVec_32s8u vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { 
+ dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); + dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); + dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); + dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); + } + #endif + for( ; x < width; x++ ) + dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); + } +}; + + +template +struct HResizeCubic +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + for( int k = 0; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + int dx = 0, limit = xmin; + for(;;) + { + for( ; dx < limit; dx++, alpha += 4 ) + { + int j, sx = xofs[dx] - cn; + WT v = 0; + for( j = 0; j < 4; j++ ) + { + int sxj = sx + j*cn; + if( (unsigned)sxj >= (unsigned)swidth ) + { + while( sxj < 0 ) + sxj += cn; + while( sxj >= swidth ) + sxj -= cn; + } + v += S[sxj]*alpha[j]; + } + D[dx] = v; + } + if( limit == dwidth ) + break; + for( ; dx < xmax; dx++, alpha += 4 ) + { + int sx = xofs[dx]; + D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + + S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; + } + limit = dwidth; + } + alpha -= dwidth*4; + } + } +}; + + +template +struct VResizeCubic +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + CastOp castOp; + VecOp vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + for( ; x < width; x++ ) + dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); + } +}; + + +template +struct HResizeLanczos4 +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + for( int k = 0; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + int dx = 0, limit = xmin; + for(;;) + { + for( ; dx < limit; dx++, alpha += 8 ) + { + int j, sx = xofs[dx] - cn*3; + WT v = 0; + for( j = 0; j < 8; j++ ) + { + int sxj = sx + j*cn; + if( (unsigned)sxj >= (unsigned)swidth ) + { + while( sxj < 0 ) + sxj += cn; + while( sxj >= swidth ) + sxj -= cn; + } + v += S[sxj]*alpha[j]; + } + D[dx] = v; + } + if( limit == dwidth ) + break; + for( ; dx < xmax; dx++, alpha += 8 ) + { + int sx = xofs[dx]; + D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + + S[sx-cn]*alpha[2] + S[sx]*alpha[3] + + S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + + S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; + } + limit = dwidth; + } + alpha -= dwidth*8; + } + } +}; + + +template +struct VResizeLanczos4 +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + CastOp castOp; + VecOp vecOp; + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + WT b = beta[0]; + const WT* S = src[0]; + WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; + + for( int k = 1; k < 8; k++ ) + { + b = beta[k]; S = src[k]; + s0 += 
S[x]*b; s1 += S[x+1]*b; + s2 += S[x+2]*b; s3 += S[x+3]*b; + } + + dst[x] = castOp(s0); dst[x+1] = castOp(s1); + dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); + } + #endif + for( ; x < width; x++ ) + { + dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + + src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + + src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]); + } + } +}; + + +static inline int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b-1) : a; +} + +static const int MAX_ESIZE=16; + +template +class resizeGeneric_Invoker : + public ParallelLoopBody +{ +public: + typedef typename HResize::value_type T; + typedef typename HResize::buf_type WT; + typedef typename HResize::alpha_type AT; + + resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, + const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, + int _ksize, int _xmin, int _xmax) : + ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), + alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), + ksize(_ksize), xmin(_xmin), xmax(_xmax) + { + CV_Assert(ksize <= MAX_ESIZE); + } + + virtual void operator() (const Range& range) const + { + int dy, cn = src.channels(); + HResize hresize; + VResize vresize; + + int bufstep = (int)alignSize(dsize.width, 16); + AutoBuffer _buffer(bufstep*ksize); + const T* srows[MAX_ESIZE]={0}; + WT* rows[MAX_ESIZE]={0}; + int prev_sy[MAX_ESIZE]; + + for(int k = 0; k < ksize; k++ ) + { + prev_sy[k] = -1; + rows[k] = (WT*)_buffer + bufstep*k; + } + + const AT* beta = _beta + ksize * range.start; + + for( dy = range.start; dy < range.end; dy++, beta += ksize ) + { + int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; + + for(int k = 0; k < ksize; k++ ) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); + for( k1 = std::max(k1, k); k1 < ksize; k1++ ) + { + if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. + { + if( k1 > k ) + memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); + break; + } + } + if( k1 == ksize ) + k0 = std::min(k0, k); // remember the first row that needs to be computed + srows[k] = src.template ptr(sy); + prev_sy[k] = sy; + } + + if( k0 < ksize ) + hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), + ssize.width, dsize.width, cn, xmin, xmax ); + vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); + } + } + +private: + Mat src; + Mat dst; + const int* xofs, *yofs; + const AT* alpha, *_beta; + Size ssize, dsize; + const int ksize, xmin, xmax; + + resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); +}; + +template +static void resizeGeneric_( const Mat& src, Mat& dst, + const int* xofs, const void* _alpha, + const int* yofs, const void* _beta, + int xmin, int xmax, int ksize ) +{ + typedef typename HResize::alpha_type AT; + + const AT* beta = (const AT*)_beta; + Size ssize = src.size(), dsize = dst.size(); + int cn = src.channels(); + ssize.width *= cn; + dsize.width *= cn; + xmin *= cn; + xmax *= cn; + // image resize is a separable operation. 
In case of not too strong + + Range range(0, dsize.height); + resizeGeneric_Invoker invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, + ssize, dsize, ksize, xmin, xmax); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); +} + +template +struct ResizeAreaFastNoVec +{ + ResizeAreaFastNoVec(int, int) { } + ResizeAreaFastNoVec(int, int, int, int) { } + int operator() (const T*, T*, int) const + { return 0; } +}; + +#if CV_NEON + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const uchar* S, uchar* D, int w) const + { + int dx = 0; + const uchar* S0 = S, * S1 = S0 + step; + + uint16x8_t v_2 = vdupq_n_u16(2); + + if (cn == 1) + { + for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) + { + uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); + + uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); + v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); + v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); + + uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); + v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); + v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); + + vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); + + uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); + uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); + uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); + uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); + + uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), + vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); + uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), + vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); + uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); + + vst1_u8(D, vmovn_u16(v_dst)); + } + } + + return dx; + } + +private: + int cn, step; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const ushort * S, ushort * D, int w) const + { + int dx = 0; + const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); + + uint32x4_t v_2 = vdupq_n_u32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); + + uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); + v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); + + uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); + v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); + + vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); + uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), + 
vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); + vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; + +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const short * S, short * D, int w) const + { + int dx = 0; + const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); + + int32x4_t v_2 = vdupq_n_s32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); + + int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); + v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); + v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); + + int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); + v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); + v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); + + vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); + int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), + vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); + vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + cn(_cn), step(_step) + { + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + float32x4_t v_025 = vdupq_n_f32(0.25f); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); + + float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); + float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); + + vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); + float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); + + vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int cn; + bool fast_mode; + int step; +}; + +#elif CV_SSE2 + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const uchar* S, uchar* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const uchar* S0 = S; + const uchar* S1 = S0 + step; + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi16(2); + + if (cn == 1) + { + __m128i masklow = _mm_set1_epi16(0x00ff); + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); + __m128i 
s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); + __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); + __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); + __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); + + __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); + __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + + s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); + s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + _mm_storel_epi64((__m128i*)(D+3), s0); + } + else + { + CV_Assert(cn == 4); + int v[] = { 0, 0, -1, -1 }; + __m128i mask = _mm_loadu_si128((const __m128i*)v); + + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); + __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); + __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); + __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); + + __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); + __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + __m128i res0 = _mm_srli_epi16(s0, 2); + + s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); + s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + __m128i res1 = _mm_srli_epi16(s0, 2); + s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), + _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); + _mm_storel_epi64((__m128i*)(D), s0); + } + } + + return dx; + } + +private: + int cn; + bool use_simd; + int step; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const ushort* S, ushort* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const ushort* S0 = (const ushort*)S; + const ushort* S1 = (const ushort*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + +#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); + __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srli_epi32(s0, 2); + s0 = _mm_packus_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D 
+= 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); + __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); + __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); + __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); + __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); + __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); + __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + +#undef _mm_packus_epi32 + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short* S, short* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const short* S0 = (const short*)S; + const short* S1 = (const short*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); + __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srai_epi32(s0, 2); + s0 = _mm_packs_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); + __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); + __m128i r1_32l = 
_mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + cn(_cn), step(_step) + { + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + __m128 v_025 = _mm_set1_ps(0.25f); + + if (cn == 1) + { + const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), + v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); + + __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), + _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); + __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), + _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); + __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int cn; + bool fast_mode; + int step; +}; + +#else + +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_8u; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; + +#endif + +template +struct ResizeAreaFastVec +{ + ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : + scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) + { + fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + } + + int operator() (const T* S, T* D, int w) const + { + if (!fast_mode) + return 0; + + const T* nextS = (const T*)((const uchar*)S + step); + int dx = vecOp(S, D, w); + + if (cn == 1) + for( ; dx < w; ++dx ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); + } + else if (cn == 3) + for( ; dx < w; dx += 3 ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); + D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); + D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); + } + else + { + CV_Assert(cn == 4); + for( ; dx < w; dx += 4 ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); + D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); + D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); + D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 
+ } + } + + return dx; + } + +private: + int scale_x, scale_y; + int cn; + bool fast_mode; + int step; + SIMDVecOp vecOp; +}; + +template +class resizeAreaFast_Invoker : + public ParallelLoopBody +{ +public: + resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, + int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : + ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), + scale_y(_scale_y), ofs(_ofs), xofs(_xofs) + { + } + + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int cn = src.channels(); + int area = scale_x*scale_y; + float scale = 1.f/(area); + int dwidth1 = (ssize.width/scale_x)*cn; + dsize.width *= cn; + ssize.width *= cn; + int dy, dx, k = 0; + + VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); + + for( dy = range.start; dy < range.end; dy++ ) + { + T* D = (T*)(dst.data + dst.step*dy); + int sy0 = dy*scale_y; + int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; + + if( sy0 >= ssize.height ) + { + for( dx = 0; dx < dsize.width; dx++ ) + D[dx] = 0; + continue; + } + + dx = vop(src.template ptr(sy0), D, w); + for( ; dx < w; dx++ ) + { + const T* S = src.template ptr(sy0) + xofs[dx]; + WT sum = 0; + k = 0; + #if CV_ENABLE_UNROLLED + for( ; k <= area - 4; k += 4 ) + sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; + #endif + for( ; k < area; k++ ) + sum += S[ofs[k]]; + + D[dx] = saturate_cast(sum * scale); + } + + for( ; dx < dsize.width; dx++ ) + { + WT sum = 0; + int count = 0, sx0 = xofs[dx]; + if( sx0 >= ssize.width ) + D[dx] = 0; + + for( int sy = 0; sy < scale_y; sy++ ) + { + if( sy0 + sy >= ssize.height ) + break; + const T* S = src.template ptr(sy0 + sy) + sx0; + for( int sx = 0; sx < scale_x*cn; sx += cn ) + { + if( sx0 + sx >= ssize.width ) + break; + sum += S[sx]; + count++; + } + } + + D[dx] = saturate_cast((float)sum/count); + } + } + } + +private: + Mat src; + Mat dst; + int scale_x, scale_y; + const int *ofs, *xofs; +}; + +template +static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, + int scale_x, int scale_y ) +{ + Range range(0, dst.rows); + resizeAreaFast_Invoker invoker(src, dst, scale_x, + scale_y, ofs, xofs); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); +} + +struct DecimateAlpha +{ + int si, di; + float alpha; +}; + + +template class ResizeArea_Invoker : + public ParallelLoopBody +{ +public: + ResizeArea_Invoker( const Mat& _src, Mat& _dst, + const DecimateAlpha* _xtab, int _xtab_size, + const DecimateAlpha* _ytab, int _ytab_size, + const int* _tabofs ) + { + src = &_src; + dst = &_dst; + xtab0 = _xtab; + xtab_size0 = _xtab_size; + ytab = _ytab; + ytab_size = _ytab_size; + tabofs = _tabofs; + } + + virtual void operator() (const Range& range) const + { + Size dsize = dst->size(); + int cn = dst->channels(); + dsize.width *= cn; + AutoBuffer _buffer(dsize.width*2); + const DecimateAlpha* xtab = xtab0; + int xtab_size = xtab_size0; + WT *buf = _buffer, *sum = buf + dsize.width; + int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; + + for( dx = 0; dx < dsize.width; dx++ ) + sum[dx] = (WT)0; + + for( j = j_start; j < j_end; j++ ) + { + WT beta = ytab[j].alpha; + int dy = ytab[j].di; + int sy = ytab[j].si; + + { + const T* S = src->template ptr(sy); + for( dx = 0; dx < dsize.width; dx++ ) + buf[dx] = (WT)0; + + if( cn == 1 ) + for( k = 0; k < xtab_size; k++ ) + { + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + buf[dxn] += S[xtab[k].si]*alpha; + } 
+ else if( cn == 2 ) + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; + } + else if( cn == 3 ) + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + WT t2 = buf[dxn+2] + S[sxn+2]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; + } + else if( cn == 4 ) + { + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; + t0 = buf[dxn+2] + S[sxn+2]*alpha; + t1 = buf[dxn+3] + S[sxn+3]*alpha; + buf[dxn+2] = t0; buf[dxn+3] = t1; + } + } + else + { + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + for( int c = 0; c < cn; c++ ) + buf[dxn + c] += S[sxn + c]*alpha; + } + } + } + + if( dy != prev_dy ) + { + T* D = dst->template ptr(prev_dy); + + for( dx = 0; dx < dsize.width; dx++ ) + { + D[dx] = saturate_cast(sum[dx]); + sum[dx] = beta*buf[dx]; + } + prev_dy = dy; + } + else + { + for( dx = 0; dx < dsize.width; dx++ ) + sum[dx] += beta*buf[dx]; + } + } + + { + T* D = dst->template ptr(prev_dy); + for( dx = 0; dx < dsize.width; dx++ ) + D[dx] = saturate_cast(sum[dx]); + } + } + +private: + const Mat* src; + Mat* dst; + const DecimateAlpha* xtab0; + const DecimateAlpha* ytab; + int xtab_size0, ytab_size; + const int* tabofs; +}; + + +template +static void resizeArea_( const Mat& src, Mat& dst, + const DecimateAlpha* xtab, int xtab_size, + const DecimateAlpha* ytab, int ytab_size, + const int* tabofs ) +{ + parallel_for_(Range(0, dst.rows), + ResizeArea_Invoker(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), + dst.total()/((double)(1 << 16))); +} + + +typedef void (*ResizeFunc)( const Mat& src, Mat& dst, + const int* xofs, const void* alpha, + const int* yofs, const void* beta, + int xmin, int xmax, int ksize ); + +typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, + const int* ofs, const int *xofs, + int scale_x, int scale_y ); + +typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, + const DecimateAlpha* xtab, int xtab_size, + const DecimateAlpha* ytab, int ytab_size, + const int* yofs); + + +static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) +{ + int k = 0; + for(int dx = 0; dx < dsize; dx++ ) + { + double fsx1 = dx * scale; + double fsx2 = fsx1 + scale; + double cellWidth = std::min(scale, ssize - fsx1); + + int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); + + sx2 = std::min(sx2, ssize - 1); + sx1 = std::min(sx1, sx2); + + if( sx1 - fsx1 > 1e-3 ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = (sx1 - 1) * cn; + tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); + } + + for(int sx = sx1; sx < sx2; sx++ ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = sx * cn; + tab[k++].alpha = float(1.0 / cellWidth); + } + + if( fsx2 - sx2 > 1e-3 ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = sx2 * cn; + tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); + } + } + return k; +} + +#ifdef HAVE_OPENCL +static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, + float * const alpha_tab, int 
* const ofs_tab) +{ + int k = 0, dx = 0; + for ( ; dx < dsize; dx++) + { + ofs_tab[dx] = k; + + double fsx1 = dx * scale; + double fsx2 = fsx1 + scale; + double cellWidth = std::min(scale, ssize - fsx1); + + int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); + + sx2 = std::min(sx2, ssize - 1); + sx1 = std::min(sx1, sx2); + + if (sx1 - fsx1 > 1e-3) + { + map_tab[k] = sx1 - 1; + alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); + } + + for (int sx = sx1; sx < sx2; sx++) + { + map_tab[k] = sx; + alpha_tab[k++] = float(1.0 / cellWidth); + } + + if (fsx2 - sx2 > 1e-3) + { + map_tab[k] = sx2; + alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); + } + } + ofs_tab[dx] = k; +} + +static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, + double fx, double fy, int interpolation) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; + float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; + int iscale_x = saturate_cast(inv_fx), iscale_y = saturate_cast(inv_fx); + bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && + std::abs(inv_fy - iscale_y) < DBL_EPSILON; + + // in case of scale_x && scale_y is equal to 2 + // INTER_AREA (fast) also is equal to INTER_LINEAR + if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) + /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower + + if( !(cn <= 4 && + (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || + (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) + return false; + + UMat src = _src.getUMat(); + _dst.create(dsize, type); + UMat dst = _dst.getUMat(); + + Size ssize = src.size(); + ocl::Kernel k; + size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows }; + + ocl::Image2D srcImage; + + // See if this could be done with a sampler. We stick with integer + // datatypes because the observed error is low. + bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && + ocl::Image2D::canCreateAlias(src) && depth <= 4 && + ocl::Image2D::isFormatSupported(depth, cn, true) && + src.offset==0); + if (useSampler) + { + int wdepth = std::max(depth, CV_32S); + char buf[2][32]; + cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " + "-D convertToDT=%s -D cn=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn); + k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); + + if (k.empty()) + useSampler = false; + else + { + // Convert the input into an OpenCL image type, using normalized channel data types + // and aliasing the UMat. 
+ srcImage = ocl::Image2D(src, true, true); + k.args(srcImage, ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + } + + if (interpolation == INTER_LINEAR && !useSampler) + { + char buf[2][32]; + + // integer path is slower because of CPU part, so it's disabled + if (depth == CV_8U && ((void)0, 0)) + { + AutoBuffer _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); + int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; + short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; + float fxx, fyy; + int sx, sy; + + for (int dx = 0; dx < dsize.width; dx++) + { + fxx = (float)((dx+0.5)*inv_fx - 0.5); + sx = cvFloor(fxx); + fxx -= sx; + + if (sx < 0) + fxx = 0, sx = 0; + + if (sx >= ssize.width-1) + fxx = 0, sx = ssize.width-1; + + xofs[dx] = sx; + ialpha[dx*2 + 0] = saturate_cast((1.f - fxx) * INTER_RESIZE_COEF_SCALE); + ialpha[dx*2 + 1] = saturate_cast(fxx * INTER_RESIZE_COEF_SCALE); + } + + for (int dy = 0; dy < dsize.height; dy++) + { + fyy = (float)((dy+0.5)*inv_fy - 0.5); + sy = cvFloor(fyy); + fyy -= sy; + + yofs[dy] = sy; + ibeta[dy*2 + 0] = saturate_cast((1.f - fyy) * INTER_RESIZE_COEF_SCALE); + ibeta[dy*2 + 1] = saturate_cast(fyy * INTER_RESIZE_COEF_SCALE); + } + + int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); + UMat coeffs; + Mat(1, static_cast(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); + + k.create("resizeLN", ocl::imgproc::resize_oclsrc, + format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " + "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " + "-D INTER_RESIZE_COEF_BITS=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, buf[0]), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn, INTER_RESIZE_COEF_BITS)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(coeffs)); + } + else + { + int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); + k.create("resizeLN", ocl::imgproc::resize_oclsrc, + format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " + "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " + "-D INTER_RESIZE_COEF_BITS=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, buf[0]), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn, INTER_RESIZE_COEF_BITS)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + } + else if (interpolation == INTER_NEAREST) + { + k.create("resizeNN", ocl::imgproc::resize_oclsrc, + format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", + ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + else if (interpolation == INTER_AREA) + { + int wdepth = std::max(depth, is_area_fast ? 
CV_32S : CV_32F); + int wtype = CV_MAKE_TYPE(wdepth, cn); + + char cvt[2][40]; + String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", + ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); + + UMat alphaOcl, tabofsOcl, mapOcl; + UMat dmap, smap; + + if (is_area_fast) + { + int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); + buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" + " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", + ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), + ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), + iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); + + k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); + if (k.empty()) + return false; + } + else + { + buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); + k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); + if (k.empty()) + return false; + + int xytab_size = (ssize.width + ssize.height) << 1; + int tabofs_size = dsize.height + dsize.width + 2; + + AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); + AutoBuffer _xyalpha_tab(xytab_size); + int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); + float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); + int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; + + ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); + ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); + + // loading precomputed arrays to GPU + Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); + Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); + Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); + } + + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); + + if (is_area_fast) + k.args(srcarg, dstarg); + else + k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), + ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); + + return k.run(2, globalsize, NULL, false); + } + + return k.run(2, globalsize, 0, false); +} + +#endif + +#ifdef HAVE_IPP +#define IPP_RESIZE_PARALLEL 1 + +#ifdef HAVE_IPP_IW +class ipp_resizeParallel: public ParallelLoopBody +{ +public: + ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): + m_src(src), m_dst(dst), m_ok(ok) {} + ~ipp_resizeParallel() + { + } + + void Init(IppiInterpolationType inter) + { + iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl); + + m_ok = true; + } + + virtual void operator() (const Range& range) const + { + CV_INSTRUMENT_REGION_IPP() + + if(!m_ok) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile); + } + catch(::ipp::IwException) + { + m_ok = false; + return; + } + } +private: + ::ipp::IwiImage &m_src; + ::ipp::IwiImage &m_dst; + + mutable ::ipp::IwiResize iwiResize; + + volatile bool &m_ok; + const ipp_resizeParallel& operator= (const ipp_resizeParallel&); +}; + +class ipp_resizeAffineParallel: public 
ParallelLoopBody +{ +public: + ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): + m_src(src), m_dst(dst), m_ok(ok) {} + ~ipp_resizeAffineParallel() + { + } + + void Init(IppiInterpolationType inter, double scaleX, double scaleY) + { + double shift = (inter == ippNearest)?-1e-10:-0.5; + double coeffs[2][3] = { + {scaleX, 0, shift+0.5*scaleX}, + {0, scaleY, shift+0.5*scaleY} + }; + + iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl); + + m_ok = true; + } + + virtual void operator() (const Range& range) const + { + CV_INSTRUMENT_REGION_IPP() + + if(!m_ok) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile); + } + catch(::ipp::IwException) + { + m_ok = false; + return; + } + } +private: + ::ipp::IwiImage &m_src; + ::ipp::IwiImage &m_dst; + + mutable ::ipp::IwiWarpAffine iwiWarpAffine; + + volatile bool &m_ok; + const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&); +}; +#endif + +static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height, + uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, + int depth, int channels, int interpolation) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP() + + IppDataType ippDataType = ippiGetDataType(depth); + IppiInterpolationType ippInter = ippiGetInterpolation(interpolation); + if(ippInter < 0) + return false; + + // Resize which doesn't match OpenCV exactly + if (!cv::ipp::useIPP_NE()) + { + if (ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear)) + return false; + } + + if(ippInter != ippLinear && ippDataType == ipp64f) + return false; + +#if IPP_VERSION_X100 < 201801 + // Degradations on int^2 linear downscale + if (ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale + { + int scale_x = (int)(1 / inv_scale_x); + int scale_y = (int)(1 / inv_scale_y); + if (1 / inv_scale_x - scale_x < DBL_EPSILON && 1 / inv_scale_y - scale_y < DBL_EPSILON) // if integer + { + if (!(scale_x&(scale_x - 1)) && !(scale_y&(scale_y - 1))) // if power of 2 + return false; + } + } +#endif + + bool affine = false; + const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10; + double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x; + double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y; + + // Use affine transform resize to allow sub-pixel accuracy + if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS) + affine = true; + + // Affine doesn't support Lanczos and Super interpolations + if(affine && (ippInter == ippLanczos || ippInter == ippSuper)) + return false; + + try + { + ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step); + ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step); + + bool ok; + int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height))); + Range range(0, dst_height); + ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok); + ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok); + ParallelLoopBody *pInvoker = NULL; + if(affine) + { + pInvoker = &invokerAffine; + 
invokerAffine.Init(ippInter, inv_scale_x, inv_scale_y); + } + else + { + pInvoker = &invokerGeneral; + invokerGeneral.Init(ippInter); + } + + if(IPP_RESIZE_PARALLEL && threads > 1) + parallel_for_(range, *pInvoker, threads*4); + else + pInvoker->operator()(range); + + if(!ok) + return false; + } + catch(::ipp::IwException) + { + return false; + } + return true; +#else + CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step); + CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth); + CV_UNUSED(channels); CV_UNUSED(interpolation); + return false; +#endif +} +#endif + +//================================================================================================== + +namespace hal { + +void resize(int src_type, + const uchar * src_data, size_t src_step, int src_width, int src_height, + uchar * dst_data, size_t dst_step, int dst_width, int dst_height, + double inv_scale_x, double inv_scale_y, int interpolation) +{ + CV_INSTRUMENT_REGION() + + CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0)); + if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON) + { + inv_scale_x = static_cast(dst_width) / src_width; + inv_scale_y = static_cast(dst_height) / src_height; + } + + CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation); + + int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type); + Size dsize = Size(saturate_cast(src_width*inv_scale_x), + saturate_cast(src_height*inv_scale_y)); + CV_Assert( dsize.area() > 0 ); + + CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation)) + + static ResizeFunc linear_tab[] = + { + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32s8u> >, + 0, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f16u> >, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f16s> >, + 0, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f> >, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeNoVec> >, + 0 + }; + + static ResizeFunc cubic_tab[] = + { + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32s8u> >, + 0, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f16u> >, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f16s> >, + 0, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f> >, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeNoVec> >, + 0 + }; + + static ResizeFunc lanczos4_tab[] = + { + resizeGeneric_, + VResizeLanczos4, + VResizeNoVec> >, + 0, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f16u> >, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f16s> >, + 0, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f> >, + resizeGeneric_, + VResizeLanczos4, + VResizeNoVec> >, + 0 + }; + + static ResizeAreaFastFunc areafast_tab[] = + { + resizeAreaFast_ >, + 0, + resizeAreaFast_ >, + resizeAreaFast_ >, + 0, + resizeAreaFast_, + resizeAreaFast_ >, + 0 + }; + + static ResizeAreaFunc area_tab[] = + { + resizeArea_, 0, resizeArea_, + resizeArea_, 0, resizeArea_, + resizeArea_, 0 + }; + + double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; + + int iscale_x = 
saturate_cast(scale_x); + int iscale_y = saturate_cast(scale_y); + + bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && + std::abs(scale_y - iscale_y) < DBL_EPSILON; + + Mat src(Size(src_width, src_height), src_type, const_cast(src_data), src_step); + Mat dst(dsize, src_type, dst_data, dst_step); + + if( interpolation == INTER_NEAREST ) + { + resizeNN( src, dst, inv_scale_x, inv_scale_y ); + return; + } + + int k, sx, sy, dx, dy; + + + { + // in case of scale_x && scale_y is equal to 2 + // INTER_AREA (fast) also is equal to INTER_LINEAR + if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) + interpolation = INTER_AREA; + + // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). + // In other cases it is emulated using some variant of bilinear interpolation + if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) + { + if( is_area_fast ) + { + int area = iscale_x*iscale_y; + size_t srcstep = src_step / src.elemSize1(); + AutoBuffer _ofs(area + dsize.width*cn); + int* ofs = _ofs; + int* xofs = ofs + area; + ResizeAreaFastFunc func = areafast_tab[depth]; + CV_Assert( func != 0 ); + + for( sy = 0, k = 0; sy < iscale_y; sy++ ) + for( sx = 0; sx < iscale_x; sx++ ) + ofs[k++] = (int)(sy*srcstep + sx*cn); + + for( dx = 0; dx < dsize.width; dx++ ) + { + int j = dx * cn; + sx = iscale_x * j; + for( k = 0; k < cn; k++ ) + xofs[j + k] = sx + k; + } + + func( src, dst, ofs, xofs, iscale_x, iscale_y ); + return; + } + + ResizeAreaFunc func = area_tab[depth]; + CV_Assert( func != 0 && cn <= 4 ); + + AutoBuffer _xytab((src_width + src_height)*2); + DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2; + + int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab); + int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab); + + AutoBuffer _tabofs(dsize.height + 1); + int* tabofs = _tabofs; + for( k = 0, dy = 0; k < ytab_size; k++ ) + { + if( k == 0 || ytab[k].di != ytab[k-1].di ) + { + assert( ytab[k].di == dy ); + tabofs[dy++] = k; + } + } + tabofs[dy] = ytab_size; + + func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); + return; + } + } + + int xmin = 0, xmax = dsize.width, width = dsize.width*cn; + bool area_mode = interpolation == INTER_AREA; + bool fixpt = depth == CV_8U; + float fx, fy; + ResizeFunc func=0; + int ksize=0, ksize2; + if( interpolation == INTER_CUBIC ) + ksize = 4, func = cubic_tab[depth]; + else if( interpolation == INTER_LANCZOS4 ) + ksize = 8, func = lanczos4_tab[depth]; + else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) + ksize = 2, func = linear_tab[depth]; + else + CV_Error( CV_StsBadArg, "Unknown interpolation method" ); + ksize2 = ksize/2; + + CV_Assert( func != 0 ); + + AutoBuffer _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); + int* xofs = (int*)(uchar*)_buffer; + int* yofs = xofs + width; + float* alpha = (float*)(yofs + dsize.height); + short* ialpha = (short*)alpha; + float* beta = alpha + width*ksize; + short* ibeta = ialpha + width*ksize; + float cbuf[MAX_ESIZE]; + + for( dx = 0; dx < dsize.width; dx++ ) + { + if( !area_mode ) + { + fx = (float)((dx+0.5)*scale_x - 0.5); + sx = cvFloor(fx); + fx -= sx; + } + else + { + sx = cvFloor(dx*scale_x); + fx = (float)((dx+1) - (sx+1)*inv_scale_x); + fx = fx <= 0 ? 
0.f : fx - cvFloor(fx); + } + + if( sx < ksize2-1 ) + { + xmin = dx+1; + if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) + fx = 0, sx = 0; + } + + if( sx + ksize2 >= src_width ) + { + xmax = std::min( xmax, dx ); + if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) + fx = 0, sx = src_width-1; + } + + for( k = 0, sx *= cn; k < cn; k++ ) + xofs[dx*cn + k] = sx + k; + + if( interpolation == INTER_CUBIC ) + interpolateCubic( fx, cbuf ); + else if( interpolation == INTER_LANCZOS4 ) + interpolateLanczos4( fx, cbuf ); + else + { + cbuf[0] = 1.f - fx; + cbuf[1] = fx; + } + if( fixpt ) + { + for( k = 0; k < ksize; k++ ) + ialpha[dx*cn*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); + for( ; k < cn*ksize; k++ ) + ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; + } + else + { + for( k = 0; k < ksize; k++ ) + alpha[dx*cn*ksize + k] = cbuf[k]; + for( ; k < cn*ksize; k++ ) + alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; + } + } + + for( dy = 0; dy < dsize.height; dy++ ) + { + if( !area_mode ) + { + fy = (float)((dy+0.5)*scale_y - 0.5); + sy = cvFloor(fy); + fy -= sy; + } + else + { + sy = cvFloor(dy*scale_y); + fy = (float)((dy+1) - (sy+1)*inv_scale_y); + fy = fy <= 0 ? 0.f : fy - cvFloor(fy); + } + + yofs[dy] = sy; + if( interpolation == INTER_CUBIC ) + interpolateCubic( fy, cbuf ); + else if( interpolation == INTER_LANCZOS4 ) + interpolateLanczos4( fy, cbuf ); + else + { + cbuf[0] = 1.f - fy; + cbuf[1] = fy; + } + + if( fixpt ) + { + for( k = 0; k < ksize; k++ ) + ibeta[dy*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); + } + else + { + for( k = 0; k < ksize; k++ ) + beta[dy*ksize + k] = cbuf[k]; + } + } + + func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, + fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize ); +} + +} // cv::hal:: +} // cv:: + +//================================================================================================== + +void cv::resize( InputArray _src, OutputArray _dst, Size dsize, + double inv_scale_x, double inv_scale_y, int interpolation ) +{ + CV_INSTRUMENT_REGION() + + Size ssize = _src.size(); + + CV_Assert( ssize.width > 0 && ssize.height > 0 ); + CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); + if( dsize.area() == 0 ) + { + dsize = Size(saturate_cast(ssize.width*inv_scale_x), + saturate_cast(ssize.height*inv_scale_y)); + CV_Assert( dsize.area() > 0 ); + } + else + { + inv_scale_x = (double)dsize.width/ssize.width; + inv_scale_y = (double)dsize.height/ssize.height; + } + + CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, + ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) + + Mat src = _src.getMat(); + _dst.create(dsize, src.type()); + Mat dst = _dst.getMat(); + + if (dsize == ssize) + { + // Source and destination are of same size. Use simple copy. + src.copyTo(dst); + return; + } + + hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation); +} + + +CV_IMPL void +cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) +{ + cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); + CV_Assert( src.type() == dst.type() ); + cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, + (double)dst.rows/src.rows, method ); +} + +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.hpp b/modules/imgproc/src/resize.hpp new file mode 100644 index 0000000000..8533306bd4 --- /dev/null +++ b/modules/imgproc/src/resize.hpp @@ -0,0 +1,75 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#ifndef OPENCV_IMGPROC_RESIZE_HPP +#define OPENCV_IMGPROC_RESIZE_HPP +#include "precomp.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ +#if CV_TRY_AVX2 +void resizeNN2_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +#endif +} + +namespace opt_SSE4_1 +{ +#if CV_TRY_SSE4_1 +void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); + +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width); +#endif +} +} +#endif +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.sse4_1.cpp b/modules/imgproc/src/resize.sse4_1.cpp new file mode 100644 index 0000000000..edcefcb9e7 --- /dev/null +++ b/modules/imgproc/src/resize.sse4_1.cpp @@ -0,0 +1,233 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. 
+// +// */ + +#include "precomp.hpp" +#include "resize.hpp" + +namespace cv +{ +namespace opt_SSE4_1 +{ + +class resizeNNInvokerSSE2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x7); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 8) + { + ushort imm = *(ushort*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi16(pixels, imm, 0); + imm = *(ushort*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi16(pixels, imm, 1); + imm = *(ushort*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi16(pixels, imm, 2); + imm = *(ushort*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi16(pixels, imm, 3); + imm = *(ushort*)(S + x_ofs[x + 4]); + pixels = _mm_insert_epi16(pixels, imm, 4); + imm = *(ushort*)(S + x_ofs[x + 5]); + pixels = _mm_insert_epi16(pixels, imm, 5); + imm = *(ushort*)(S + x_ofs[x + 6]); + pixels = _mm_insert_epi16(pixels, imm, 6); + imm = *(ushort*)(S + x_ofs[x + 7]); + pixels = _mm_insert_epi16(pixels, imm, 7); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); + resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); +}; + +class resizeNNInvokerSSE4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x3); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 4) + { + int imm = *(int*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi32(pixels, imm, 0); + imm = *(int*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi32(pixels, imm, 1); + imm = *(int*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi32(pixels, imm, 2); + imm = *(int*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi32(pixels, imm, 3); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); + resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); +}; + +void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int 
pix_size4, double ify) +{ + resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for (; x <= width - 8; x += 8) + { + __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); + + __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); + __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); + } + + return x; +} + +} +} +/* End of file. */
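Reviewer note, illustrative only and not part of the patch: the short C++ sketch below shows how the public cv::resize entry point ends up on the code paths touched above. The matrix sizes and pixel values are arbitrary; the interpolation flags and the dispatch behaviour they trigger are taken from the hal::resize logic in this diff.

#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar(64, 128, 255));
    cv::Mat dst;

    // INTER_NEAREST goes through resizeNN which, per this patch, can dispatch to
    // the resizeNN2_* / resizeNN4_* SSE4.1 and AVX2 invokers declared in
    // resize.hpp for 2- and 4-byte pixels.
    cv::resize(src, dst, cv::Size(), 2.0, 2.0, cv::INTER_NEAREST);

    // An exact 2x2 downscale requested as INTER_LINEAR is rerouted to the fast
    // INTER_AREA path (ResizeAreaFastVec), since the two are equivalent for that
    // scale.
    cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);

    // A general non-integer downscale with INTER_AREA builds DecimateAlpha tables
    // via computeResizeAreaTab and runs ResizeArea_Invoker.
    cv::resize(src, dst, cv::Size(300, 200), 0, 0, cv::INTER_AREA);

    // Other factors with INTER_LINEAR, INTER_CUBIC or INTER_LANCZOS4 use the
    // separable resizeGeneric_ path: a horizontal pass into a row buffer, then a
    // vertical pass weighted by the per-row beta coefficients.
    cv::resize(src, dst, cv::Size(), 1.7, 1.3, cv::INTER_LANCZOS4);
    return 0;
}

Note also that when the requested destination size equals the source size, cv::resize short-circuits to a plain copy and never reaches hal::resize.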