From 8fdbdb131d3b2bb6de6d0bda1808a06a5f5114b8 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 2 Jul 2014 01:32:45 +0400
Subject: [PATCH] SSE4.1 optimization of cv::Moments CV_16U

---
 modules/imgproc/src/moments.cpp | 161 ++++++++++++++++++++------------
 1 file changed, 103 insertions(+), 58 deletions(-)

diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 61fff29852..a61002a792 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -203,69 +203,27 @@ static Moments contourMoments( const Mat& contour )
 \****************************************************************************************/
 
 template<typename T, typename WT, typename MT>
-#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
-// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
-__attribute__((optimize("no-tree-vectorize")))
-#endif
-static void momentsInTile( const Mat& img, double* moments )
+struct MomentsInTile_SSE
 {
-    Size size = img.size();
-    int x, y;
-    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
-
-    for( y = 0; y < size.height; y++ )
+    int operator() (const T *, int, WT &, WT &, WT &, MT &)
     {
-        const T* ptr = (const T*)(img.data + y*img.step);
-        WT x0 = 0, x1 = 0, x2 = 0;
-        MT x3 = 0;
-
-        for( x = 0; x < size.width; x++ )
-        {
-            WT p = ptr[x];
-            WT xp = x * p, xxp;
-
-            x0 += p;
-            x1 += xp;
-            xxp = xp * x;
-            x2 += xxp;
-            x3 += xxp * x;
-        }
-
-        WT py = y * x0, sy = y*y;
-
-        mom[9] += ((MT)py) * sy;  // m03
-        mom[8] += ((MT)x1) * sy;  // m12
-        mom[7] += ((MT)x2) * y;   // m21
-        mom[6] += x3;             // m30
-        mom[5] += x0 * sy;        // m02
-        mom[4] += x1 * y;         // m11
-        mom[3] += x2;             // m20
-        mom[2] += py;             // m01
-        mom[1] += x1;             // m10
-        mom[0] += x0;             // m00
+        return 0;
     }
-
-    for( x = 0; x < 10; x++ )
-        moments[x] = (double)mom[x];
-}
-
+};
 
 #if CV_SSE2
 
-template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
+template <>
+struct MomentsInTile_SSE<uchar, int, int>
 {
-    typedef uchar T;
-    typedef int WT;
-    typedef int MT;
-    Size size = img.size();
-    int y;
-    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    MomentsInTile_SSE()
+    {
+        useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
 
-    for( y = 0; y < size.height; y++ )
+    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
     {
-        const T* ptr = img.ptr(y);
-        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;
+        int x = 0;
 
         if( useSIMD )
         {
@@ -273,7 +231,7 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
             __m128i dx = _mm_set1_epi16(8);
             __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;
 
-            for( ; x <= size.width - 8; x += 8 )
+            for( ; x <= len - 8; x += 8 )
             {
                 __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                 qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
@@ -285,6 +243,7 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
                 qx = _mm_add_epi16(qx, dx);
             }
 
+            int CV_DECL_ALIGNED(16) buf[4];
             _mm_store_si128((__m128i*)buf, qx0);
             x0 = buf[0] + buf[1] + buf[2] + buf[3];
@@ -296,6 +255,94 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
             _mm_store_si128((__m128i*)buf, qx3);
             x3 = buf[0] + buf[1] + buf[2] + buf[3];
         }
 
+        return x;
+    }
+
+    bool useSIMD;
+};
+
+#endif
+
+#if CV_SSE4_1
+
+template <>
+struct MomentsInTile_SSE<ushort, int, int64>
+{
+    MomentsInTile_SSE()
+    {
+        useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator() (const ushort * ptr, int len, int & x0, int & x1, int & x2, int64 & x3)
+    {
+        int x = 0;
+
+        if (useSIMD)
+        {
+            __m128i vx_init0 = _mm_setr_epi32(0, 1, 2, 3), vx_init1 = _mm_setr_epi32(4, 5, 6, 7),
+                v_delta = _mm_set1_epi32(8), v_zero = _mm_setzero_si128(), v_x0 = v_zero,
+                v_x1 = v_zero, v_x2 = v_zero, v_x3 = v_zero, v_ix0 = vx_init0, v_ix1 = vx_init1;
+
+            for( ; x <= len - 8; x += 8 )
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(ptr + x));
+                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_zero), v_src1 = _mm_unpackhi_epi16(v_src, v_zero);
+
+                v_x0 = _mm_add_epi32(v_x0, _mm_add_epi32(v_src0, v_src1));
+                __m128i v_x1_0 = _mm_mullo_epi32(v_src0, v_ix0), v_x1_1 = _mm_mullo_epi32(v_src1, v_ix1);
+                v_x1 = _mm_add_epi32(v_x1, _mm_add_epi32(v_x1_0, v_x1_1));
+
+                __m128i v_2ix0 = _mm_mullo_epi32(v_ix0, v_ix0), v_2ix1 = _mm_mullo_epi32(v_ix1, v_ix1);
+                v_x2 = _mm_add_epi32(v_x2, _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_src0), _mm_mullo_epi32(v_2ix1, v_src1)));
+
+                __m128i t = _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_x1_0), _mm_mullo_epi32(v_2ix1, v_x1_1));
+                v_x3 = _mm_add_epi64(v_x3, _mm_add_epi64(_mm_unpacklo_epi32(t, v_zero), _mm_unpackhi_epi32(t, v_zero)));
+
+                v_ix0 = _mm_add_epi32(v_ix0, v_delta);
+                v_ix1 = _mm_add_epi32(v_ix1, v_delta);
+            }
+
+            int CV_DECL_ALIGNED(16) buf[4];
+            int64 CV_DECL_ALIGNED(16) buf64[2];
+
+            _mm_store_si128((__m128i*)buf, v_x0);
+            x0 = buf[0] + buf[1] + buf[2] + buf[3];
+            _mm_store_si128((__m128i*)buf, v_x1);
+            x1 = buf[0] + buf[1] + buf[2] + buf[3];
+            _mm_store_si128((__m128i*)buf, v_x2);
+            x2 = buf[0] + buf[1] + buf[2] + buf[3];
+
+            _mm_store_si128((__m128i*)buf64, v_x3);
+            x3 = buf64[0] + buf64[1];
+        }
+
+        return x;
+    }
+
+    bool useSIMD;
+};
+
+#endif
+
+template<typename T, typename WT, typename MT>
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
+// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static void momentsInTile( const Mat& img, double* moments )
+{
+    Size size = img.size();
+    int x, y;
+    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
+    MomentsInTile_SSE<T, WT, MT> vop;
+
+    for( y = 0; y < size.height; y++ )
+    {
+        const T* ptr = (const T*)(img.data + y*img.step);
+        WT x0 = 0, x1 = 0, x2 = 0;
+        MT x3 = 0;
+        x = vop(ptr, size.width, x0, x1, x2, x3);
+
         for( ; x < size.width; x++ )
         {
             WT p = ptr[x];
@@ -322,12 +369,10 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
         mom[0] += x0;             // m00
     }
 
-    for(int x = 0; x < 10; x++ )
+    for( x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
 }
 
-#endif
-
 typedef void (*MomentsInTileFunc)(const Mat& img, double* moments);
 
 Moments::Moments()
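
Note on the new kernel (appended for review, not part of the patch): for one row
of width len, the SSE4.1 path accumulates x0 = sum(p), x1 = sum(x*p),
x2 = sum(x^2*p) and x3 = sum(x^3*p), widening only the cubic sum to 64 bits,
since x^3*p row sums can exceed 32 bits for 16-bit pixels; momentsInTile then
folds these four row sums into the ten tile moments. The sketch below restates
that reduction in scalar form and adds a hypothetical usage example;
rowMomentsScalar and main() are illustrative only and are not OpenCV API.

    #include <opencv2/imgproc/imgproc.hpp>
    #include <cstdio>

    // Scalar equivalent of MomentsInTile_SSE<ushort, int, int64>::operator():
    // per-row sums of p, x*p, x^2*p and x^3*p, the last accumulated in 64 bits.
    // Accumulator widths assume tile-sized rows, as in momentsInTile.
    static void rowMomentsScalar(const ushort* ptr, int len,
                                 int& x0, int& x1, int& x2, int64& x3)
    {
        for (int x = 0; x < len; x++)
        {
            int p = ptr[x];         // pixel value
            int xp = x * p;         // x * p
            int xxp = xp * x;       // x^2 * p
            x0 += p;
            x1 += xp;
            x2 += xxp;
            x3 += (int64)xxp * x;   // x^3 * p, widened to 64 bits
        }
    }

    int main()
    {
        // Hypothetical CV_16U input: cv::moments hands tiles of it to
        // momentsInTile<ushort, int, int64>, which with this patch takes the
        // SSE4.1 path when checkHardwareSupport(CV_CPU_SSE4_1) reports true.
        cv::Mat img(480, 640, CV_16UC1, cv::Scalar(0));
        img(cv::Rect(200, 150, 240, 180)).setTo(cv::Scalar(60000));

        cv::Moments m = cv::moments(img);
        std::printf("m00 = %.0f, centroid = (%.2f, %.2f)\n",
                    m.m00, m.m10 / m.m00, m.m01 / m.m00);
        return 0;
    }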