diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 15c4a61039..41a9cf36c7 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -227,16 +227,15 @@ struct MomentsInTile_SIMD if( useSIMD ) { - __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i dx = _mm_set1_epi16(8); - __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init; + __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); for( ; x <= len - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); __m128i sx = _mm_mullo_epi16(qx, qx); - qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); + qx0 = _mm_add_epi16(qx0, p); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); @@ -244,14 +243,21 @@ struct MomentsInTile_SIMD qx = _mm_add_epi16(qx, dx); } - _mm_store_si128((__m128i*)buf, qx0); - x0 = buf[0] + buf[1] + buf[2] + buf[3]; - _mm_store_si128((__m128i*)buf, qx1); - x1 = buf[0] + buf[1] + buf[2] + buf[3]; - _mm_store_si128((__m128i*)buf, qx2); - x2 = buf[0] + buf[1] + buf[2] + buf[3]; - _mm_store_si128((__m128i*)buf, qx3); - x3 = buf[0] + buf[1] + buf[2] + buf[3]; + __m128i qx01_lo = _mm_unpacklo_epi32(qx0, qx1); + __m128i qx23_lo = _mm_unpacklo_epi32(qx2, qx3); + __m128i qx01_hi = _mm_unpackhi_epi32(qx0, qx1); + __m128i qx23_hi = _mm_unpackhi_epi32(qx2, qx3); + qx01_lo = _mm_add_epi32(qx01_lo, qx01_hi); + qx23_lo = _mm_add_epi32(qx23_lo, qx23_hi); + __m128i qx0123_lo = _mm_unpacklo_epi64(qx01_lo, qx23_lo); + __m128i qx0123_hi = _mm_unpackhi_epi64(qx01_lo, qx23_lo); + qx0123_lo = _mm_add_epi32(qx0123_lo, qx0123_hi); + _mm_store_si128((__m128i*)buf, qx0123_lo); + + x0 = (buf[0] & 0xffff) + (buf[0] >> 16); + x1 = buf[1]; + x2 = buf[2]; + x3 = buf[3]; } return x;