From cba22349b71278e7fba46bd4e43dc0eac518f2db Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Sat, 29 Oct 2016 13:24:31 +0900 Subject: [PATCH] add universal hardware support check function * use hasSIMD128 rather than calling checkHardwareSupport * add SIMD check in spatialgradient.cpp * add SIMD check in stereosgbm.cpp * add SIMD check in canny.cpp --- modules/calib3d/src/stereosgbm.cpp | 10 +- .../include/opencv2/core/hal/intrin_cpp.hpp | 11 + .../include/opencv2/core/hal/intrin_neon.hpp | 11 + .../include/opencv2/core/hal/intrin_sse.hpp | 11 + modules/core/src/arithm_simd.hpp | 28 +- modules/imgproc/src/canny.cpp | 6 +- modules/imgproc/src/spatialgradient.cpp | 267 +++++++++--------- 7 files changed, 190 insertions(+), 154 deletions(-) diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 45873941a4..203cbc6c31 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -132,7 +132,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, const PixType *row1 = img1.ptr(y), *row2 = img2.ptr(y); PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2; #if CV_SIMD128 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + bool useSIMD = hasSIMD128(); #endif tab += tabOfs; @@ -292,7 +292,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, }; static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + bool useSIMD = hasSIMD128(); #endif const int ALIGN = 16; @@ -891,7 +891,7 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli ftzero = std::max(params.preFilterCap, 15) | 1; #if CV_SIMD128 - useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + useSIMD = hasSIMD128(); #endif } @@ -1054,7 +1054,7 @@ inline void 
accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2) { #if CV_SIMD128 - if(checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) + if(hasSIMD128()) { v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast(P1)); @@ -1166,7 +1166,7 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost) { #if CV_SIMD128 - if(checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) + if(hasSIMD128()) { v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast(P1)); diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 94aead605a..8e8b691559 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1772,6 +1772,17 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, //! @} +//! @name Check SIMD support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD128() +{ + return false; +} + +//! @} + + } #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 1c59c2a35c..daf3d2e07e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -46,6 +46,7 @@ #define OPENCV_HAL_INTRIN_NEON_HPP #include +#include "opencv2/core/utility.hpp" namespace cv { @@ -1218,6 +1219,16 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) } #endif +//! @name Check SIMD support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD128() +{ + return checkHardwareSupport(CV_CPU_NEON); +} + +//! @} + //! 
@endcond } diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 16d7eb8444..fc81dac35d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -46,6 +46,7 @@ #define OPENCV_HAL_SSE_HPP #include +#include "opencv2/core/utility.hpp" #define CV_SIMD128 1 #define CV_SIMD128_64F 1 @@ -1726,6 +1727,16 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) } #endif +//! @name Check SIMD support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD128() +{ + return checkHardwareSupport(CV_CPU_SSE2); +} + +//! @} + //! @endcond } diff --git a/modules/core/src/arithm_simd.hpp b/modules/core/src/arithm_simd.hpp index b6a549ed92..7d72383a31 100644 --- a/modules/core/src/arithm_simd.hpp +++ b/modules/core/src/arithm_simd.hpp @@ -1197,7 +1197,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { @@ -1243,7 +1243,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { @@ -1289,7 +1289,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { @@ -1334,7 +1334,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || 
checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { @@ -1379,7 +1379,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { @@ -1423,7 +1423,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const { @@ -1463,7 +1463,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const uchar * src2, uchar * dst, int width, double scale) const { @@ -1504,7 +1504,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const schar * src2, schar * dst, int width, double scale) const { @@ -1545,7 +1545,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const ushort * src2, ushort * dst, int width, double scale) const { @@ -1585,7 +1585,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const short * src2, short * dst, int width, double scale) const { @@ -1625,7 
+1625,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const int * src2, int * dst, int width, double scale) const { @@ -1665,7 +1665,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const float * src2, float * dst, int width, double scale) const { @@ -1702,7 +1702,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Div_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const { @@ -1739,7 +1739,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + Recip_SIMD() { haveSIMD = hasSIMD128(); } int operator() (const double * src2, double * dst, int width, double scale) const { diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 19fc13b380..a9fbe8bff2 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -301,7 +301,7 @@ public: void operator()(const Range &boundaries) const { #if CV_SIMD128 - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + bool haveSIMD = hasSIMD128(); #endif const int type = src.type(), cn = CV_MAT_CN(type); @@ -709,7 +709,7 @@ public: uchar* pdst = dst.ptr() + (ptrdiff_t)(dst.step * boundaries.start); #if CV_SIMD128 - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + bool haveSIMD = hasSIMD128(); #endif for (int i = boundaries.start; i < boundaries.end; i++, pmap += mapstep, pdst += dst.step) @@ -962,7 +962,7 @@ 
static void CannyImpl(Mat& dx, Mat& dy, Mat& dst, #define CANNY_POP(d) (d) = *--stack_top #if CV_SIMD128 - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + bool haveSIMD = hasSIMD128(); #endif // calculate magnitude and angle of gradient, perform non-maxima suppression. diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index ecf52f9ce1..9217558181 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -130,140 +130,143 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, int i_start = 0; int j_start = 0; #if CV_SIMD128 && CV_SSE2 - uchar *m_src; - short *n_dx, *n_dy; - - // Characters in variable names have the following meanings: - // u: unsigned char - // s: signed int - // - // [row][column] - // m: offset -1 - // n: offset 0 - // p: offset 1 - // Example: umn is offset -1 in row and offset 0 in column - for ( i = 0; i < H - 1; i += 2 ) + if(hasSIMD128()) { - if ( i == 0 ) p_src = src.ptr(i_top); - else p_src = src.ptr(i-1); - - c_src = src.ptr(i); - n_src = src.ptr(i+1); - - if ( i == H - 2 ) m_src = src.ptr(i_bottom); - else m_src = src.ptr(i+2); - - c_dx = dx.ptr(i); - c_dy = dy.ptr(i); - n_dx = dx.ptr(i+1); - n_dy = dy.ptr(i+1); - - v_uint8x16 v_select_m = v_uint8x16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0xFF); - - // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - 16; j += 16 ) + uchar *m_src; + short *n_dx, *n_dy; + + // Characters in variable names have the following meanings: + // u: unsigned char + // s: signed int + // + // [row][column] + // m: offset -1 + // n: offset 0 + // p: offset 1 + // Example: umn is offset -1 in row and offset 0 in column + for ( i = 0; i < H - 1; i += 2 ) { - // Load top row for 3x3 Sobel filter - v_uint8x16 v_um = v_load(&p_src[j-1]); - v_uint8x16 v_up = v_load(&p_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_uint8x16 
v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); - v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); - - // Load second row for 3x3 Sobel filter - v_um = v_load(&c_src[j-1]); - v_up = v_load(&c_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); - - // Load third row for 3x3 Sobel filter - v_um = v_load(&n_src[j-1]); - v_up = v_load(&n_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 1, 2, 3 - v_int16x8 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, - 
v_s1m1, v_s1n1, v_s1p1, - v_s2m1, v_s2p1, - v_s3m1, v_s3n1, v_s3p1 ); - - v_int16x8 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, - v_s1m2, v_s1n2, v_s1p2, - v_s2m2, v_s2p2, - v_s3m2, v_s3n2, v_s3p2 ); - - // Store - v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+8], v_sdx2); - v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+8], v_sdy2); - - // Load fourth row for 3x3 Sobel filter - v_um = v_load(&m_src[j-1]); - v_up = v_load(&m_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, - v_s2m1, v_s2n1, v_s2p1, - v_s3m1, v_s3p1, - v_s4m1, v_s4n1, v_s4p1 ); - - spatialGradientKernel( v_sdx2, v_sdy2, - v_s2m2, v_s2n2, v_s2p2, - v_s3m2, v_s3p2, - v_s4m2, v_s4n2, v_s4p2 ); - - // Store - v_store(&n_dx[j], v_sdx1); - v_store(&n_dx[j+8], v_sdx2); - v_store(&n_dy[j], v_sdy1); - v_store(&n_dy[j+8], v_sdy2); + if ( i == 0 ) p_src = src.ptr(i_top); + else p_src = src.ptr(i-1); + + c_src = src.ptr(i); + n_src = src.ptr(i+1); + + if ( i == H - 2 ) m_src = src.ptr(i_bottom); + else m_src = src.ptr(i+2); + + c_dx = dx.ptr(i); + c_dy = dy.ptr(i); + n_dx = dx.ptr(i+1); + n_dy = dy.ptr(i+1); + + v_uint8x16 v_select_m = v_uint8x16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0xFF); + + // Process rest of columns 16-column chunks at a time + for ( j = 1; j < W - 16; j += 16 ) + { + // Load top row for 3x3 Sobel filter + v_uint8x16 v_um = v_load(&p_src[j-1]); + v_uint8x16 v_up = v_load(&p_src[j+1]); + 
// TODO: Replace _mm_slli_si128 with hal method + v_uint8x16 v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), + v_uint8x16(_mm_srli_si128(v_um.val, 1))); + v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); + v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); + v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); + v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); + v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); + v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); + + // Load second row for 3x3 Sobel filter + v_um = v_load(&c_src[j-1]); + v_up = v_load(&c_src[j+1]); + // TODO: Replace _mm_slli_si128 with hal method + v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), + v_uint8x16(_mm_srli_si128(v_um.val, 1))); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); + v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); + v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); + v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); + v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); + v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); + + // Load third row for 3x3 Sobel filter + v_um = v_load(&n_src[j-1]); + v_up = v_load(&n_src[j+1]); + // TODO: Replace _mm_slli_si128 with hal method + v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), + v_uint8x16(_mm_srli_si128(v_um.val, 1))); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); + v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); + v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); + v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); + v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); + v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); + + // dx & dy for rows 1, 2, 3 + v_int16x8 
v_sdx1, v_sdy1; + spatialGradientKernel( v_sdx1, v_sdy1, + v_s1m1, v_s1n1, v_s1p1, + v_s2m1, v_s2p1, + v_s3m1, v_s3n1, v_s3p1 ); + + v_int16x8 v_sdx2, v_sdy2; + spatialGradientKernel( v_sdx2, v_sdy2, + v_s1m2, v_s1n2, v_s1p2, + v_s2m2, v_s2p2, + v_s3m2, v_s3n2, v_s3p2 ); + + // Store + v_store(&c_dx[j], v_sdx1); + v_store(&c_dx[j+8], v_sdx2); + v_store(&c_dy[j], v_sdy1); + v_store(&c_dy[j+8], v_sdy2); + + // Load fourth row for 3x3 Sobel filter + v_um = v_load(&m_src[j-1]); + v_up = v_load(&m_src[j+1]); + // TODO: Replace _mm_slli_si128 with hal method + v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), + v_uint8x16(_mm_srli_si128(v_um.val, 1))); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); + v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); + v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); + v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); + v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); + v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); + + // dx & dy for rows 2, 3, 4 + spatialGradientKernel( v_sdx1, v_sdy1, + v_s2m1, v_s2n1, v_s2p1, + v_s3m1, v_s3p1, + v_s4m1, v_s4n1, v_s4p1 ); + + spatialGradientKernel( v_sdx2, v_sdy2, + v_s2m2, v_s2n2, v_s2p2, + v_s3m2, v_s3p2, + v_s4m2, v_s4n2, v_s4p2 ); + + // Store + v_store(&n_dx[j], v_sdx1); + v_store(&n_dx[j+8], v_sdx2); + v_store(&n_dy[j], v_sdy1); + v_store(&n_dy[j+8], v_sdy2); + } } } i_start = i;