@@ -239,13 +239,12 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
         acctype iA11 = 0, iA12 = 0, iA22 = 0;
         float A11, A12, A22;
 
-#if CV_SSE2
-        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-        __m128i z = _mm_setzero_si128();
-        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
-        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
-        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+        v_int16x8 qw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+        v_int16x8 qw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+        v_int32x4 qdelta_d = v_setall_s32(1 << (W_BITS1-1));
+        v_int32x4 qdelta = v_setall_s32(1 << (W_BITS1-5-1));
+        v_float32x4 qA11 = v_setzero_f32(), qA12 = v_setzero_f32(), qA22 = v_setzero_f32();
 #endif
 
 #if CV_NEON
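
The interleaved qw0/qw1 layout reproduces what _mm_set1_epi32(iw00 + (iw01 << 16)) gave the SSE2 path: each 32-bit lane carries one weight pair, so a multiply-add over adjacent 16-bit elements computes the bilinear sum directly. A minimal sketch of the idiom, assuming the file's 14-bit weight scale (W_BITS1) and a hypothetical helper interp4() over two caller-supplied 16-bit rows (row0/row1 need at least 9 readable elements):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Sketch: fixed-point bilinear interpolation of 4 values with v_dotprod.
    static void interp4(const short* row0, const short* row1,
                        int iw00, int iw01, int iw10, int iw11, int* out)
    {
    #if CV_SIMD128
        const int W_BITS1 = 14;
        v_int16x8 qw0((short)iw00, (short)iw01, (short)iw00, (short)iw01,
                      (short)iw00, (short)iw01, (short)iw00, (short)iw01);
        v_int16x8 qw1((short)iw10, (short)iw11, (short)iw10, (short)iw11,
                      (short)iw10, (short)iw11, (short)iw10, (short)iw11);
        v_int16x8 t00, t01, t10, t11;
        v_zip(v_load(row0), v_load(row0 + 1), t00, t01); // pair pixel with right neighbour
        v_zip(v_load(row1), v_load(row1 + 1), t10, t11);
        v_int32x4 rnd = v_setall_s32(1 << (W_BITS1 - 1)); // rounding term
        // lane i: p00*iw00 + p01*iw01 + p10*iw10 + p11*iw11 + rnd
        v_int32x4 t0 = v_dotprod(t00, qw0, rnd) + v_dotprod(t10, qw1);
        v_store(out, t0 >> W_BITS1);
    #endif
    }
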
@@ -275,44 +274,75 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
            x = 0;
 
-#if CV_SSE2
-            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
+#if CV_SIMD128 && !CV_NEON
+            for( ; x <= winSize.width*cn - 8; x += 8, dsrc += 8*2, dIptr += 8*2 )
             {
-                __m128i v00, v01, v10, v11, t0, t1;
-                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
-                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
-                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
-                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
-
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
-
-                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
-                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
-                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
-                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
-
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
-                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
-                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
-
-                _mm_storeu_si128((__m128i*)dIptr, v00);
-                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
-                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
-
-                __m128 fy = _mm_cvtepi32_ps(t0);
-                __m128 fx = _mm_cvtepi32_ps(t1);
-
-                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
-                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
-                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
+                v_int32x4 t0, t1;
+                v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
+
+                v00 = v_reinterpret_as_s16(v_load_expand(src + x));
+                v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
+                v10 = v_reinterpret_as_s16(v_load_expand(src + x + stepI));
+                v11 = v_reinterpret_as_s16(v_load_expand(src + x + stepI + cn));
+
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+
+                t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                t0 = t0 >> (W_BITS1-5);
+                t1 = t1 >> (W_BITS1-5);
+                v_store(Iptr + x, v_pack(t0, t1));
+
+                v00 = v_reinterpret_as_s16(v_load(dsrc));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + dstep + cn2));
+
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr, v00);
+
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+
+                v_float32x4 fy = v_cvt_f32(t0);
+                v_float32x4 fx = v_cvt_f32(t1);
+
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
+
+                v00 = v_reinterpret_as_s16(v_load(dsrc + 4*2));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep + cn2));
+
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr + 4*2, v00);
+
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+
+                fy = v_cvt_f32(t0);
+                fx = v_cvt_f32(t1);
+
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
             }
 #endif
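
The double v_interleave_pairs shuffle is the universal-intrinsics replacement for the SSE2 shift/pack trick: applied first on 16-bit lanes and then on 32-bit lanes, it sorts [Ix0 Iy0 Ix1 Iy1 ...] into [Ix0..Ix3 Iy0..Iy3], so v_expand hands back the x and y gradients as separate 32-bit vectors. A self-contained sketch of that accumulation step (accumulate4() is a hypothetical helper over one 8-short gradient block):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Sketch: fold one block of interleaved gradients [Ix0 Iy0 ... Ix3 Iy3]
    // into the structure-tensor sums A11 = sum(Ix*Ix), A12 = sum(Ix*Iy),
    // A22 = sum(Iy*Iy). dIptr points at packed 16-bit gradients as above.
    static void accumulate4(const short* dIptr, float& A11, float& A12, float& A22)
    {
    #if CV_SIMD128
        v_int16x8 v = v_load(dIptr); // Ix0 Iy0 Ix1 Iy1 Ix2 Iy2 Ix3 Iy3
        // 16-bit pass: Ix0 Ix1 Iy0 Iy1 Ix2 Ix3 Iy2 Iy3
        // 32-bit pass: Ix0 Ix1 Ix2 Ix3 Iy0 Iy1 Iy2 Iy3
        v = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v))));
        v_int32x4 tx, ty;
        v_expand(v, tx, ty); // low half -> Ix, high half -> Iy
        v_float32x4 fx = v_cvt_f32(tx), fy = v_cvt_f32(ty);
        A11 += v_reduce_sum(fx * fx);
        A12 += v_reduce_sum(fx * fy);
        A22 += v_reduce_sum(fy * fy);
    #endif
    }
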
@@ -419,14 +449,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             }
         }
 
-#if CV_SSE2
-        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
-        _mm_store_ps(A11buf, qA11);
-        _mm_store_ps(A12buf, qA12);
-        _mm_store_ps(A22buf, qA22);
-        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
-        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
-        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
+#if CV_SIMD128 && !CV_NEON
+        iA11 += v_reduce_sum(qA11);
+        iA12 += v_reduce_sum(qA12);
+        iA22 += v_reduce_sum(qA22);
 #endif
 
 #if CV_NEON
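
v_reduce_sum collapses the per-lane accumulators in one call, where the SSE2 path had to spill each register to an aligned buffer and add the four floats by hand. The two are equivalent; a trivial sketch:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // v_reduce_sum(v) == v[0] + v[1] + v[2] + v[3]
    static float horizontalSumDemo()
    {
    #if CV_SIMD128
        v_float32x4 v(1.f, 2.f, 3.f, 4.f);
        return v_reduce_sum(v); // 10.f
    #else
        return 1.f + 2.f + 3.f + 4.f;
    #endif
    }
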
@@ -479,10 +505,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
             acctype ib1 = 0, ib2 = 0;
             float b1, b2;
-#if CV_SSE2
-            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+            qw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+            qw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+            v_float32x4 qb0 = v_setzero_f32(), qb1 = v_setzero_f32();
 #endif
 
 #if CV_NEON
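
For context, iw00..iw11 are the fixed-point bilinear weights for the current sub-pixel position; the iw11 line above forces them to sum to exactly 1 << W_BITS so the interpolation cannot drift. A sketch of how such weights can be derived from the fractional offsets (a, b) of the point, assuming W_BITS = 14 as in this file (the exact rounding shown is illustrative):

    #include <opencv2/core/fast_math.hpp>

    // Sketch: fixed-point bilinear weights from fractional offsets a, b in [0, 1).
    static void bilinearWeights(float a, float b,
                                int& iw00, int& iw01, int& iw10, int& iw11)
    {
        const int W_BITS = 14;
        iw00 = cvRound((1.f - a) * (1.f - b) * (1 << W_BITS));
        iw01 = cvRound(a * (1.f - b) * (1 << W_BITS));
        iw10 = cvRound((1.f - a) * b * (1 << W_BITS));
        // residual, so iw00 + iw01 + iw10 + iw11 == 1 << W_BITS exactly
        iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
    }
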
@@ -503,34 +529,32 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 x = 0;
 
-#if CV_SSE2
+#if CV_SIMD128 && !CV_NEON
                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                 {
-                    __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
-                    __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
-                    __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
-                    __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
-                    __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
-
-                    __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                    __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                    t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                    t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
-                    diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
-                    diff1 = _mm_unpackhi_epi16(diff0, diff0);
-                    diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
-                    v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
-                    v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
-                    v10 = _mm_unpacklo_epi16(v00, v01);
-                    v11 = _mm_unpackhi_epi16(v00, v01);
-                    v00 = _mm_unpacklo_epi16(diff0, diff1);
-                    v01 = _mm_unpackhi_epi16(diff0, diff1);
-                    v00 = _mm_madd_epi16(v00, v10);
-                    v11 = _mm_madd_epi16(v01, v11);
-                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
-                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
+                    v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
+                    v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
+                    v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
+                    v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ));
+                    v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ + cn));
+
+                    v_int32x4 t0, t1;
+                    v_int16x8 t00, t01, t10, t11;
+                    v_zip(v00, v01, t00, t01);
+                    v_zip(v10, v11, t10, t11);
+
+                    t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                    t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                    t0 = t0 >> (W_BITS1-5);
+                    t1 = t1 >> (W_BITS1-5);
+                    diff0 = v_pack(t0, t1) - diff0;
+                    v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
+                    v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                    v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
+                    v_zip(v00, v01, v10, v11);
+                    v_zip(diff2, diff1, v00, v01);
+                    qb0 += v_cvt_f32(v_dotprod(v00, v10));
+                    qb1 += v_cvt_f32(v_dotprod(v01, v11));
                 }
 #endif
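
The zip sequence above pairs each temporal difference It with its gradient so that a single v_dotprod accumulates contributions to both b1 and b2, interleaved per lane. A scalar reference for what the loop computes (Jw is a hypothetical buffer standing in for the already-interpolated J patch):

    // Scalar reference: mismatch vector b = sum over the patch of It * [Ix, Iy].
    static void mismatchRef(const short* Jw, const short* Iptr, const short* dIptr,
                            int n, float& b1, float& b2)
    {
        for (int x = 0; x < n; x++)
        {
            int It = Jw[x] - Iptr[x];           // temporal difference
            b1 += (float)(It * dIptr[2*x]);     // It * Ix
            b2 += (float)(It * dIptr[2*x + 1]); // It * Iy
        }
    }
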
@@ -616,11 +640,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 }
             }
 
-#if CV_SSE2
-            float CV_DECL_ALIGNED(16) bbuf[4];
-            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
-            ib1 += bbuf[0] + bbuf[2];
-            ib2 += bbuf[1] + bbuf[3];
+#if CV_SIMD128 && !CV_NEON
+            v_float32x4 qf0, qf1;
+            v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
+            ib1 += v_reduce_sum(qf0);
+            ib2 += v_reduce_sum(qf1);
 #endif
 
 #if CV_NEON
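
Because the loop leaves qb0 and qb1 holding [b1, b2, b1, b2] per lane, the final step has to separate the two sums before reducing, which is what the v_interleave_pairs/v_recombine pair does. A standalone sketch of that split:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    #if CV_SIMD128
    // Sketch: split an interleaved accumulator [b1a b2a b1b b2b] into b1 and b2.
    static void splitB(const v_float32x4& qb, float& b1, float& b2)
    {
        v_float32x4 qf0, qf1;
        // v_interleave_pairs: [b1a b2a b1b b2b] -> [b1a b1b b2a b2b]
        // v_recombine with zero: qf0 = [b1a b1b 0 0], qf1 = [b2a b2b 0 0]
        v_recombine(v_interleave_pairs(qb), v_setzero_f32(), qf0, qf1);
        b1 += v_reduce_sum(qf0);
        b2 += v_reduce_sum(qf1);
    }
    #endif
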