From 2f5af1bd33df8f0ebb2ba8d5c2c3144fdca898c5 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Wed, 30 Jan 2019 22:37:27 +0300 Subject: [PATCH] Merge pull request #13693 from terfendail:spatialgrad_wintr * spatialGradient() reworked to use wide universal intrinsics * Moved row pointers inside loops --- modules/imgproc/src/spatialgradient.cpp | 258 +++++++++++------------- 1 file changed, 120 insertions(+), 138 deletions(-) diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index c942264e00..1aed1fa031 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -123,139 +123,125 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, } } - // Pointer to row vectors - uchar *p_src, *c_src, *n_src; // previous, current, next row - short *c_dx, *c_dy; - int i_start = 0; int j_start = 0; -#if CV_SIMD128 - if(hasSIMD128()) +#if CV_SIMD + // Characters in variable names have the following meanings: + // u: unsigned char + // s: signed int + // + // [row][column] + // m: offset -1 + // n: offset 0 + // p: offset 1 + // Example: umn is offset -1 in row and offset 0 in column + for ( i = 0; i < H - 1; i += 2 ) { - uchar *m_src; - short *n_dx, *n_dy; - - // Characters in variable names have the following meanings: - // u: unsigned char - // s: signed int - // - // [row][column] - // m: offset -1 - // n: offset 0 - // p: offset 1 - // Example: umn is offset -1 in row and offset 0 in column - for ( i = 0; i < H - 1; i += 2 ) + uchar *p_src = src.ptr(i == 0 ? i_top : i - 1); + uchar *c_src = src.ptr(i); + uchar *n_src = src.ptr(i+1); + uchar *m_src = src.ptr(i == H - 2 ? 
i_bottom : i + 2); + + short *c_dx = dx.ptr(i); + short *c_dy = dy.ptr(i); + short *n_dx = dx.ptr(i+1); + short *n_dy = dy.ptr(i+1); + + // Process rest of columns v_uint8::nlanes-column chunks at a time + for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes) { - if ( i == 0 ) p_src = src.ptr(i_top); - else p_src = src.ptr(i-1); - - c_src = src.ptr(i); - n_src = src.ptr(i+1); - - if ( i == H - 2 ) m_src = src.ptr(i_bottom); - else m_src = src.ptr(i+2); - - c_dx = dx.ptr(i); - c_dy = dy.ptr(i); - n_dx = dx.ptr(i+1); - n_dy = dy.ptr(i+1); - - // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - 16; j += 16 ) - { - // Load top row for 3x3 Sobel filter - v_uint8x16 v_um = v_load(&p_src[j-1]); - v_uint8x16 v_un = v_load(&p_src[j]); - v_uint8x16 v_up = v_load(&p_src[j+1]); - v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); - - // Load second row for 3x3 Sobel filter - v_um = v_load(&c_src[j-1]); - v_un = v_load(&c_src[j]); - v_up = v_load(&c_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); - - // Load third row for 3x3 Sobel filter - v_um = v_load(&n_src[j-1]); - v_un = v_load(&n_src[j]); - v_up = v_load(&n_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, 
v_up2); - v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 1, 2, 3 - v_int16x8 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, - v_s1m1, v_s1n1, v_s1p1, - v_s2m1, v_s2p1, - v_s3m1, v_s3n1, v_s3p1 ); - - v_int16x8 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, - v_s1m2, v_s1n2, v_s1p2, - v_s2m2, v_s2p2, - v_s3m2, v_s3n2, v_s3p2 ); - - // Store - v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+8], v_sdx2); - v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+8], v_sdy2); - - // Load fourth row for 3x3 Sobel filter - v_um = v_load(&m_src[j-1]); - v_un = v_load(&m_src[j]); - v_up = v_load(&m_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, - v_s2m1, v_s2n1, v_s2p1, - v_s3m1, v_s3p1, - v_s4m1, v_s4n1, v_s4p1 ); - - spatialGradientKernel( v_sdx2, v_sdy2, - v_s2m2, v_s2n2, v_s2p2, - v_s3m2, v_s3p2, - v_s4m2, v_s4n2, v_s4p2 ); - - // Store - v_store(&n_dx[j], v_sdx1); - v_store(&n_dx[j+8], v_sdx2); - v_store(&n_dy[j], v_sdy1); - v_store(&n_dy[j+8], v_sdy2); - } + // Load top row for 3x3 Sobel filter + v_uint8 v_um = vx_load(&p_src[j-1]); + v_uint8 v_un = vx_load(&p_src[j]); + v_uint8 v_up = vx_load(&p_src[j+1]); + v_uint16 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 
v_s1m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s1m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s1n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s1n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s1p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s1p2 = v_reinterpret_as_s16(v_up2); + + // Load second row for 3x3 Sobel filter + v_um = vx_load(&c_src[j-1]); + v_un = vx_load(&c_src[j]); + v_up = vx_load(&c_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s2m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s2m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s2n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s2n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s2p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s2p2 = v_reinterpret_as_s16(v_up2); + + // Load third row for 3x3 Sobel filter + v_um = vx_load(&n_src[j-1]); + v_un = vx_load(&n_src[j]); + v_up = vx_load(&n_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s3m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s3m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s3n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s3n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s3p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s3p2 = v_reinterpret_as_s16(v_up2); + + // dx & dy for rows 1, 2, 3 + v_int16 v_sdx1, v_sdy1; + spatialGradientKernel( v_sdx1, v_sdy1, + v_s1m1, v_s1n1, v_s1p1, + v_s2m1, v_s2p1, + v_s3m1, v_s3n1, v_s3p1 ); + + v_int16 v_sdx2, v_sdy2; + spatialGradientKernel( v_sdx2, v_sdy2, + v_s1m2, v_s1n2, v_s1p2, + v_s2m2, v_s2p2, + v_s3m2, v_s3n2, v_s3p2 ); + + // Store + v_store(&c_dx[j], v_sdx1); + v_store(&c_dx[j+v_int16::nlanes], v_sdx2); + v_store(&c_dy[j], v_sdy1); + v_store(&c_dy[j+v_int16::nlanes], v_sdy2); + + // Load fourth row for 3x3 Sobel filter + v_um = vx_load(&m_src[j-1]); + v_un = vx_load(&m_src[j]); + v_up = vx_load(&m_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + 
v_expand(v_up, v_up1, v_up2); + v_int16 v_s4m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s4m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s4n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s4n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s4p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2); + + // dx & dy for rows 2, 3, 4 + spatialGradientKernel( v_sdx1, v_sdy1, + v_s2m1, v_s2n1, v_s2p1, + v_s3m1, v_s3p1, + v_s4m1, v_s4n1, v_s4p1 ); + + spatialGradientKernel( v_sdx2, v_sdy2, + v_s2m2, v_s2n2, v_s2p2, + v_s3m2, v_s3p2, + v_s4m2, v_s4n2, v_s4p2 ); + + // Store + v_store(&n_dx[j], v_sdx1); + v_store(&n_dx[j+v_int16::nlanes], v_sdx2); + v_store(&n_dy[j], v_sdy1); + v_store(&n_dy[j+v_int16::nlanes], v_sdy2); } } i_start = i; @@ -265,16 +251,12 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, uchar v00, v01, v02, v10, v11, v12, v20, v21, v22; for ( i = 0; i < H; i++ ) { - if ( i == 0 ) p_src = src.ptr(i_top); - else p_src = src.ptr(i-1); - - c_src = src.ptr(i); - - if ( i == H - 1 ) n_src = src.ptr(i_bottom); - else n_src = src.ptr(i+1); + uchar *p_src = src.ptr(i == 0 ? i_top : i - 1); + uchar *c_src = src.ptr(i); + uchar *n_src = src.ptr(i == H - 1 ? i_bottom : i + 1); - c_dx = dx.ptr(i); - c_dy = dy.ptr(i); + short *c_dx = dx.ptr(i); + short *c_dy = dy.ptr(i); // Process left-most column j = 0;