|
|
|
@ -227,16 +227,15 @@ struct MomentsInTile_SIMD<uchar, int, int> |
|
|
|
|
|
|
|
|
|
if( useSIMD ) |
|
|
|
|
{ |
|
|
|
|
__m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); |
|
|
|
|
__m128i dx = _mm_set1_epi16(8); |
|
|
|
|
__m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init; |
|
|
|
|
__m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); |
|
|
|
|
|
|
|
|
|
for( ; x <= len - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
__m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); |
|
|
|
|
__m128i sx = _mm_mullo_epi16(qx, qx); |
|
|
|
|
|
|
|
|
|
qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); |
|
|
|
|
qx0 = _mm_add_epi16(qx0, p); |
|
|
|
|
qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); |
|
|
|
|
qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); |
|
|
|
|
qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); |
|
|
|
@ -244,14 +243,21 @@ struct MomentsInTile_SIMD<uchar, int, int> |
|
|
|
|
qx = _mm_add_epi16(qx, dx); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
_mm_store_si128((__m128i*)buf, qx0); |
|
|
|
|
x0 = buf[0] + buf[1] + buf[2] + buf[3]; |
|
|
|
|
_mm_store_si128((__m128i*)buf, qx1); |
|
|
|
|
x1 = buf[0] + buf[1] + buf[2] + buf[3]; |
|
|
|
|
_mm_store_si128((__m128i*)buf, qx2); |
|
|
|
|
x2 = buf[0] + buf[1] + buf[2] + buf[3]; |
|
|
|
|
_mm_store_si128((__m128i*)buf, qx3); |
|
|
|
|
x3 = buf[0] + buf[1] + buf[2] + buf[3]; |
|
|
|
|
__m128i qx01_lo = _mm_unpacklo_epi32(qx0, qx1); |
|
|
|
|
__m128i qx23_lo = _mm_unpacklo_epi32(qx2, qx3); |
|
|
|
|
__m128i qx01_hi = _mm_unpackhi_epi32(qx0, qx1); |
|
|
|
|
__m128i qx23_hi = _mm_unpackhi_epi32(qx2, qx3); |
|
|
|
|
qx01_lo = _mm_add_epi32(qx01_lo, qx01_hi); |
|
|
|
|
qx23_lo = _mm_add_epi32(qx23_lo, qx23_hi); |
|
|
|
|
__m128i qx0123_lo = _mm_unpacklo_epi64(qx01_lo, qx23_lo); |
|
|
|
|
__m128i qx0123_hi = _mm_unpackhi_epi64(qx01_lo, qx23_lo); |
|
|
|
|
qx0123_lo = _mm_add_epi32(qx0123_lo, qx0123_hi); |
|
|
|
|
_mm_store_si128((__m128i*)buf, qx0123_lo); |
|
|
|
|
|
|
|
|
|
x0 = (buf[0] & 0xffff) + (buf[0] >> 16); |
|
|
|
|
x1 = buf[1]; |
|
|
|
|
x2 = buf[2]; |
|
|
|
|
x3 = buf[3]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return x; |
|
|
|
|