@@ -41,17 +41,22 @@
# include "precomp.hpp"
# include <float.h>
# include <stdio.h>
# include "lkpyramid.hpp"
namespace cv
namespace
{
typedef short deriv_type;
static void calcSharrDeriv( const Mat& src, Mat& dst )
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
{
    using namespace cv;
    using cv::detail::deriv_type;
    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
    CV_Assert(depth == CV_8U);
    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::calcSharrDeriv(src, dst))
        return;
#endif

    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
    AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
@@ -127,373 +132,355 @@ static void calcSharrDeriv(const Mat& src, Mat& dst)
}
}
} //namespace
struct LKTrackerInvoker
{
    LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
cv::detail::LKTrackerInvoker::LKTrackerInvoker(
    const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                      const Point2f* _prevPts, Point2f* _nextPts,
                      uchar* _status, float* _err,
                      Size _winSize, TermCriteria _criteria,
                      int _level, int _maxLevel, int _flags, float _minEigThreshold )
{
    prevImg = &_prevImg;
    prevDeriv = &_prevDeriv;
    nextImg = &_nextImg;
    prevPts = _prevPts;
    nextPts = _nextPts;
    status = _status;
    err = _err;
    winSize = _winSize;
    criteria = _criteria;
    level = _level;
    maxLevel = _maxLevel;
    flags = _flags;
    minEigThreshold = _minEigThreshold;
}
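// The constructor only captures pointers to the shared inputs; the per-point
// tracking work happens in operator(), which parallel_for calls on sub-ranges.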
void operator()(const BlockedRange& range) const
void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
{
    Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
    const Mat& I = *prevImg;
    const Mat& J = *nextImg;
    const Mat& derivI = *prevDeriv;
    int j, cn = I.channels(), cn2 = cn*2;
    cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
    int derivDepth = DataType<deriv_type>::depth;
    Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
    Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
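    // A single AutoBuffer is split into two windows: IWinBuf holds the
    // interpolated template patch from I, derivIWinBuf its interleaved
    // (Ix, Iy) Scharr derivatives.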
    for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ )
    {
        Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
        Point2f nextPt;
        if( level == maxLevel )
        {
            if( flags & OPTFLOW_USE_INITIAL_FLOW )
                nextPt = nextPts[ptidx]*(float)(1./(1 << level));
            else
                nextPt = prevPt;
        }
        else
            nextPt = nextPts[ptidx]*2.f;
        nextPts[ptidx] = nextPt;
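        // Coordinates are rescaled into the current pyramid level: divided by
        // 2^level at the coarsest level, and the previous level's estimate is
        // doubled on the way back down to level 0.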
        Point2i iprevPt, inextPt;
        prevPt -= halfWin;
        iprevPt.x = cvFloor(prevPt.x);
        iprevPt.y = cvFloor(prevPt.y);
        if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
            iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }
        float a = prevPt.x - iprevPt.x;
        float b = prevPt.y - iprevPt.y;
        const int W_BITS = 14, W_BITS1 = 14;
        const float FLT_SCALE = 1.f/(1 << 20);
        int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
        int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
        int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
        int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
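        // Bilinear interpolation weights in Q14 fixed point; iw11 is derived by
        // subtraction so the four weights sum to exactly 1 << W_BITS.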
        int dstep = (int)(derivI.step/derivI.elemSize1());
        int step = (int)(I.step/I.elemSize1());
        CV_Assert( step == (int)(J.step/J.elemSize1()) );
        float A11 = 0, A12 = 0, A22 = 0;
#if CV_SSE2
        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
        __m128i z = _mm_setzero_si128();
        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
#endif
        // extract the patch from the first image, compute covariance matrix of derivatives
        int x, y;
        for( y = 0; y < winSize.height; y++ )
        {
            const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn;
            const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2;
            deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step);
            deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);
            x = 0;
#if CV_SSE2
            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
                __m128i v00, v01, v10, v11, t0, t1;
                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z);
                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z);
                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
                _mm_storeu_si128((__m128i*)dIptr, v00);
                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
                __m128 fy = _mm_cvtepi32_ps(t0);
                __m128 fx = _mm_cvtepi32_ps(t1);
                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
            }
#endif
            for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
            {
                int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5);
                int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);
                Iptr[x] = (short)ival;
                dIptr[0] = (short)ixval;
                dIptr[1] = (short)iyval;
                A11 += (float)(ixval*ixval);
                A12 += (float)(ixval*iyval);
                A22 += (float)(iyval*iyval);
            }
        }
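        // A11, A12, A22 accumulate the 2x2 gradient matrix of the window,
        // G = sum [Ix*Ix, Ix*Iy; Ix*Iy, Iy*Iy], used by the normal equations
        // below (the SSE path keeps partial sums that are reduced next).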
#if CV_SSE2
        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
        _mm_store_ps(A11buf, qA11);
        _mm_store_ps(A12buf, qA12);
        _mm_store_ps(A22buf, qA22);
        A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
        A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
        A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif
        A11 *= FLT_SCALE;
        A12 *= FLT_SCALE;
        A22 *= FLT_SCALE;
        float D = A11*A22 - A12*A12;
        float minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) +
                        4.f*A12*A12))/(2*winSize.width*winSize.height);
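        // Smaller eigenvalue of the symmetric 2x2 matrix G in closed form,
        // normalized by the window area: the Shi-Tomasi trackability measure.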
        if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 )
            err[ptidx] = (float)minEig;
        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }
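        // Iteratively solve G * delta = b (the Lucas-Kanade normal equations),
        // updating nextPt until the step is small or criteria.maxCount is reached.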
        D = 1.f/D;
        nextPt -= halfWin;
        Point2f prevDelta;
        for( j = 0; j < criteria.maxCount; j++ )
        {
            inextPt.x = cvFloor(nextPt.x);
            inextPt.y = cvFloor(nextPt.y);
            if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
                inextPt.y < -winSize.height || inextPt.y >= J.rows )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }
            a = nextPt.x - inextPt.x;
            b = nextPt.y - inextPt.y;
            iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
            iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float b1 = 0, b2 = 0;
#if CV_SSE2
            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
#endif
            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn;
                const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);
                const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);
                x = 0;
#if CV_SSE2
                for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                {
                    __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
                    __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
                    __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
                    __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z);
                    __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z);
                    __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                    __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                    t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                    t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
                    diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
                    diff1 = _mm_unpackhi_epi16(diff0, diff0);
                    diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
                    v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
                    v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
                    v10 = _mm_mullo_epi16(v00, diff0);
                    v11 = _mm_mulhi_epi16(v00, diff0);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                    v10 = _mm_mullo_epi16(v01, diff1);
                    v11 = _mm_mulhi_epi16(v01, diff1);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                }
#endif
                for( ; x < winSize.width*cn; x++, dIptr += 2 )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    b1 += (float)(diff*dIptr[0]);
                    b2 += (float)(diff*dIptr[1]);
                }
            }
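            // b1, b2 accumulate the mismatch vector b = sum It*(Ix, Iy), where
            // It is the temporal difference between the warped J patch and I.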
#if CV_SSE2
            float CV_DECL_ALIGNED(16) bbuf[4];
            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
            b1 += bbuf[0] + bbuf[2];
            b2 += bbuf[1] + bbuf[3];
#endif
            b1 *= FLT_SCALE;
            b2 *= FLT_SCALE;
            Point2f delta( (float)((A12*b2 - A22*b1) * D),
                           (float)((A12*b1 - A11*b2) * D) );
            //delta = -delta;
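            // Cramer's-rule solve of the 2x2 system: delta = -G^-1 * b. Since
            // diff was taken as J - I above, this is the forward step to add.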
            nextPt += delta;
            nextPts[ptidx] = nextPt + halfWin;
            if( delta.ddot(delta) <= criteria.epsilon )
                break;
            if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
                std::abs(delta.y + prevDelta.y) < 0.01 )
            {
                nextPts[ptidx] -= delta*0.5f;
                break;
            }
            prevDelta = delta;
        }
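        // When successive steps nearly cancel (delta ~ -prevDelta), the solution
        // is oscillating around the optimum, so the loop above backs up half a
        // step and stops early.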
        if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 )
        {
            Point2f nextPt = nextPts[ptidx] - halfWin;
            Point inextPt;
            inextPt.x = cvFloor(nextPt.x);
            inextPt.y = cvFloor(nextPt.y);
            if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
                inextPt.y < -winSize.height || inextPt.y >= J.rows )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }
            float a = nextPt.x - inextPt.x;
            float b = nextPt.y - inextPt.y;
            iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
            iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float errval = 0.f;
            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn;
                const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);
                for( x = 0; x < winSize.width*cn; x++ )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((float)diff);
                }
}
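            // errval is the sum of absolute differences; the factor 32 undoes the
            // 5 extra fixed-point bits kept by CV_DESCALE(..., W_BITS1-5), giving
            // a mean absolute intensity error per pixel.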
            err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
        }
}
}
    const Mat* prevImg;
    const Mat* nextImg;
    const Mat* prevDeriv;
    const Point2f* prevPts;
    Point2f* nextPts;
    uchar* status;
    float* err;
    Size winSize;
    TermCriteria criteria;
    int level;
    int maxLevel;
    int flags;
    float minEigThreshold;
};
}
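// A minimal usage sketch (OpenCV 2.4-era API; variable names are illustrative):
//
//   std::vector<cv::Point2f> prev, next;
//   std::vector<uchar> status;
//   std::vector<float> err;
//   cv::goodFeaturesToTrack(prevGray, prev, 500, 0.01, 10);
//   cv::calcOpticalFlowPyrLK(prevGray, nextGray, prev, next, status, err,
//                            cv::Size(21, 21), 3);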
int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives,
                                int pyrBorder, int derivBorder, bool tryReuseInputImage)
{
@@ -503,7 +490,7 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si
    pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0);
    int derivType = CV_MAKETYPE(DataType<deriv_type>::depth, img.channels() * 2);
    int derivType = CV_MAKETYPE(DataType<cv::detail::deriv_type>::depth, img.channels() * 2);
    //level 0
    bool lvl0IsSet = false;
@@ -597,12 +584,8 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                               TermCriteria criteria,
                               int flags, double minEigThreshold )
{
#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags, minEigThreshold))
        return;
#endif
    Mat prevPtsMat = _prevPts.getMat();
    const int derivDepth = DataType<deriv_type>::depth;
    const int derivDepth = DataType<cv::detail::deriv_type>::depth;
    CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );
@@ -744,6 +727,12 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
        CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
        CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());
#ifdef HAVE_TEGRA_OPTIMIZATION
        typedef tegra::LKTrackerInvoker<cv::detail::LKTrackerInvoker> LKTrackerInvoker;
#else
        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
#endif
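        // With Tegra optimizations built in, the Tegra invoker is used under the
        // same local name; its template parameter supplies the generic
        // cv::detail implementation, presumably as the fallback path.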
        parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
                                                                nextPyr[level * lvlStep2], prevPts, nextPts,
                                                                status, err,