@@ -203,7 +203,7 @@ static Moments contourMoments( const Mat& contour )
 \****************************************************************************************/

 template<typename T, typename WT, typename MT>
-struct MomentsInTile_SSE
+struct MomentsInTile_SIMD
 {
     int operator() (const T *, int, WT &, WT &, WT &, MT &)
     {
@@ -214,9 +214,9 @@ struct MomentsInTile_SSE
 #if CV_SSE2

 template <>
-struct MomentsInTile_SSE<uchar, int, int>
+struct MomentsInTile_SIMD<uchar, int, int>
 {
-    MomentsInTile_SSE()
+    MomentsInTile_SIMD()
     {
         useSIMD = checkHardwareSupport(CV_CPU_SSE2);
     }
@@ -234,17 +234,16 @@ struct MomentsInTile_SSE<uchar, int, int>
             for( ; x <= len - 8; x += 8 )
             {
                 __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
-                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
-                __m128i px = _mm_mullo_epi16(p, qx);
                 __m128i sx = _mm_mullo_epi16(qx, qx);
+
+                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                 qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                 qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
-                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));
+                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

                 qx = _mm_add_epi16(qx, dx);
             }

-            int CV_DECL_ALIGNED(16) buf[4];
             _mm_store_si128((__m128i*)buf, qx0);
             x0 = buf[0] + buf[1] + buf[2] + buf[3];
             _mm_store_si128((__m128i*)buf, qx1);
@@ -258,17 +257,84 @@ struct MomentsInTile_SSE<uchar, int, int>
         return x;
     }

+    int CV_DECL_ALIGNED(16) buf[4];
     bool useSIMD;
 };

+#elif CV_NEON
+
+template <>
+struct MomentsInTile_SIMD<uchar, int, int>
+{
+    MomentsInTile_SIMD()
+    {
+        ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 };
+        qx_init = vld1_u16(init);
+        v_step = vdup_n_u16(4);
+    }
+
+    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
+    {
+        int x = 0;
+
+        uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z,
+            v_x2 = v_z, v_x3 = v_z;
+        uint16x4_t qx = qx_init;
+
+        for( ; x <= len - 8; x += 8 )
+        {
+            uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x));
+
+            // first part
+            uint32x4_t v_qx = vmovl_u16(qx);
+            uint16x4_t v_p = vget_low_u16(v_src);
+            uint32x4_t v_px = vmull_u16(qx, v_p);
+
+            v_x0 = vaddw_u16(v_x0, v_p);
+            v_x1 = vaddq_u32(v_x1, v_px);
+            v_px = vmulq_u32(v_px, v_qx);
+            v_x2 = vaddq_u32(v_x2, v_px);
+            v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+            qx = vadd_u16(qx, v_step);
+
+            // second part
+            v_qx = vmovl_u16(qx);
+            v_p = vget_high_u16(v_src);
+            v_px = vmull_u16(qx, v_p);
+
+            v_x0 = vaddw_u16(v_x0, v_p);
+            v_x1 = vaddq_u32(v_x1, v_px);
+            v_px = vmulq_u32(v_px, v_qx);
+            v_x2 = vaddq_u32(v_x2, v_px);
+            v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+
+            qx = vadd_u16(qx, v_step);
+        }
+
+        vst1q_u32(buf, v_x0);
+        x0 = buf[0] + buf[1] + buf[2] + buf[3];
+        vst1q_u32(buf, v_x1);
+        x1 = buf[0] + buf[1] + buf[2] + buf[3];
+        vst1q_u32(buf, v_x2);
+        x2 = buf[0] + buf[1] + buf[2] + buf[3];
+        vst1q_u32(buf, v_x3);
+        x3 = buf[0] + buf[1] + buf[2] + buf[3];
+
+        return x;
+    }
+
+    uint CV_DECL_ALIGNED(16) buf[4];
+    uint16x4_t qx_init, v_step;
+};
+
 #endif

 #if CV_SSE4_1

 template <>
-struct MomentsInTile_SSE<ushort, int, int64>
+struct MomentsInTile_SIMD<ushort, int, int64>
 {
-    MomentsInTile_SSE()
+    MomentsInTile_SIMD()
     {
         useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
     }
@@ -302,9 +368,6 @@ struct MomentsInTile_SSE<ushort, int, int64>
                 v_ix1 = _mm_add_epi32(v_ix1, v_delta);
             }

-            int CV_DECL_ALIGNED(16) buf[4];
-            int64 CV_DECL_ALIGNED(16) buf64[2];
-
             _mm_store_si128((__m128i*)buf, v_x0);
             x0 = buf[0] + buf[1] + buf[2] + buf[3];
             _mm_store_si128((__m128i*)buf, v_x1);
@@ -319,6 +382,8 @@ struct MomentsInTile_SSE<ushort, int, int64>
         return x;
     }

+    int CV_DECL_ALIGNED(16) buf[4];
+    int64 CV_DECL_ALIGNED(16) buf64[2];
     bool useSIMD;
 };

@@ -334,7 +399,7 @@ static void momentsInTile( const Mat& img, double* moments )
     Size size = img.size();
     int x, y;
     MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
-    MomentsInTile_SSE<T, WT, MT> vop;
+    MomentsInTile_SIMD<T, WT, MT> vop;

     for( y = 0; y < size.height; y++ )
     {
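// --- Illustrative note, not part of the patch above ------------------------------------
// Each MomentsInTile_SIMD specialization accumulates, for one image row, the sums
// x0 = sum(p), x1 = sum(x*p), x2 = sum(x^2*p), x3 = sum(x^3*p) over the pixels it
// processes, and returns how many pixels were handled so the caller can finish the row
// with a scalar tail loop. A minimal scalar sketch of that contract follows; the function
// name and the plain C++ accumulator types here are assumptions for illustration only,
// and the caller is assumed to zero-initialize x0..x3.
static int momentsRowReference(const unsigned char* ptr, int len,
                               int& x0, int& x1, int& x2, long long& x3)
{
    int x = 0;
    for( ; x < len; x++ )
    {
        int p = ptr[x];
        int xp = x * p, xxp = x * xp;   // x*p and x^2*p for this column
        x0 += p;                        // row contribution to m00
        x1 += xp;                       // row contribution to m10
        x2 += xxp;                      // row contribution to m20
        x3 += (long long)xxp * x;       // row contribution to m30
    }
    return x;                           // pixels processed (SIMD versions stop at a multiple of 8)
}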