@@ -150,7 +150,7 @@ void findScaleSpaceExtrema(
 void calcSIFTDescriptor(
         const Mat& img, Point2f ptf, float ori, float scl,
-        int d, int n, float* dst
+        int d, int n, Mat& dst, int row
 );
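
The signature change replaces the raw float* output with a Mat reference plus a row index: each descriptor is written straight into its row of the caller's descriptor matrix, and the function can dispatch on the output type (CV_32F or CV_8U) itself. A minimal, self-contained sketch of that row-writing pattern; this is not the actual OpenCV caller, and writeDescriptorRow is a made-up name:

    #include <opencv2/core.hpp>
    using namespace cv;

    // Toy version of the pattern: the writer picks the element type per row.
    static void writeDescriptorRow(const float* raw, int len, Mat& dstMat, int row)
    {
        if (dstMat.type() == CV_32F)
        {
            float* dst = dstMat.ptr<float>(row);
            for (int k = 0; k < len; k++)
                dst[k] = raw[k];
        }
        else // CV_8U: half the memory per descriptor
        {
            uchar* dst = dstMat.ptr<uchar>(row);
            for (int k = 0; k < len; k++)
                dst[k] = saturate_cast<uchar>(raw[k]);
        }
    }

    int main()
    {
        float raw[128];
        for (int k = 0; k < 128; k++) raw[k] = 3.f * k;
        Mat desc(10, 128, CV_8U);   // one 128-element descriptor per keypoint row
        writeDescriptorRow(raw, 128, desc, 0);
        return 0;
    }
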
@@ -555,7 +555,7 @@ void findScaleSpaceExtrema(
 void calcSIFTDescriptor(
         const Mat& img, Point2f ptf, float ori, float scl,
-        int d, int n, float* dst
+        int d, int n, Mat& dstMat, int row
 )
 {
     CV_TRACE_FUNCTION();
@@ -575,9 +575,18 @@ void calcSIFTDescriptor(
     int i, j, k, len = (radius*2+1)*(radius*2+1), histlen = (d+2)*(d+2)*(n+2);
     int rows = img.rows, cols = img.cols;
-    AutoBuffer<float> buf(len*6 + histlen);
-    float *X = buf.data(), *Y = X + len, *Mag = Y, *Ori = Mag + len, *W = Ori + len;
-    float *RBin = W + len, *CBin = RBin + len, *hist = CBin + len;
+    cv::utils::BufferArea area;
+    float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *RBin = 0, *CBin = 0, *hist = 0, *rawDst = 0;
+    area.allocate(X, len, CV_SIMD_WIDTH);
+    area.allocate(Y, len, CV_SIMD_WIDTH);
+    area.allocate(Ori, len, CV_SIMD_WIDTH);
+    area.allocate(W, len, CV_SIMD_WIDTH);
+    area.allocate(RBin, len, CV_SIMD_WIDTH);
+    area.allocate(CBin, len, CV_SIMD_WIDTH);
+    area.allocate(hist, histlen, CV_SIMD_WIDTH);
+    area.allocate(rawDst, len, CV_SIMD_WIDTH);
+    area.commit();
+    Mag = Y;
     for( i = 0; i < d+2; i++ )
     {
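
cv::utils::BufferArea is a private core utility (opencv2/core/utils/buffer_area.private.hpp): each array is registered with allocate(), then a single commit() carves them all out of one block with every pointer aligned to CV_SIMD_WIDTH, which is what makes the vx_load_aligned calls below legal. Outside the library the same idea can be mimicked with AutoBuffer plus manual alignment; a rough sketch under that assumption:

    #include <opencv2/core.hpp>
    using namespace cv;

    int main()
    {
        const int len = 1369, histlen = 288;   // example sizes, e.g. (radius*2+1)^2 and (d+2)^2*(n+2)
        const size_t align = 32;               // stand-in for CV_SIMD_WIDTH (bytes)
        // Round each array up so the next one also starts on an aligned boundary.
        auto padded = [&](int n) { return alignSize((size_t)n * sizeof(float), (int)align) / sizeof(float); };
        // 7 len-sized arrays (X, Y, Ori, W, RBin, CBin, rawDst) + hist, plus slack for alignPtr.
        AutoBuffer<float> buf(7 * padded(len) + padded(histlen) + align / sizeof(float));
        float* X   = alignPtr(buf.data(), (int)align);
        float* Y   = X + padded(len);          // Mag aliases Y, as in the patch (Mag = Y)
        float* Ori = Y + padded(len);
        float* W   = Ori + padded(len);
        // ... RBin, CBin, hist, rawDst carved the same way ...
        (void)W;
        return 0;
    }
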
@@ -628,10 +637,10 @@ void calcSIFTDescriptor(
         const v_int32 __n_plus_2 = vx_setall_s32(n+2);
         for( ; k <= len - vecsize; k += vecsize )
         {
-            v_float32 rbin = vx_load(RBin + k);
-            v_float32 cbin = vx_load(CBin + k);
-            v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad;
-            v_float32 mag = vx_load(Mag + k) * vx_load(W + k);
+            v_float32 rbin = vx_load_aligned(RBin + k);
+            v_float32 cbin = vx_load_aligned(CBin + k);
+            v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
+            v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
             v_int32 r0 = v_floor(rbin);
             v_int32 c0 = v_floor(cbin);
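
vx_load accepts any address, while vx_load_aligned requires a CV_SIMD_WIDTH-aligned pointer; the swap is valid because the arrays now come from the aligned BufferArea and k only advances in whole vector widths. A tiny standalone illustration of that contract:

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    int main()
    {
    #if CV_SIMD
        // Aligned backing store, like the BufferArea-allocated arrays in the patch.
        CV_DECL_ALIGNED(CV_SIMD_WIDTH) float data[v_float32::nlanes * 2] = {0};
        v_float32 a = vx_load_aligned(data);                      // OK: base is aligned
        v_float32 b = vx_load(data + 1);                          // misaligned: must stay vx_load
        v_float32 c = vx_load_aligned(data + v_float32::nlanes);  // OK: whole-vector offset
        float out[v_float32::nlanes];
        v_store(out, a + b + c);   // v_store itself has no alignment requirement
        (void)out;
    #endif
        return 0;
    }
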
@@ -723,7 +732,7 @@ void calcSIFTDescriptor(
             hist[idx] += hist[idx+n];
             hist[idx+1] += hist[idx+n+1];
             for( k = 0; k < n; k++ )
-                dst[(i*d + j)*n + k] = hist[idx+k];
+                rawDst[(i*d + j)*n + k] = hist[idx+k];
         }
     // copy histogram to the descriptor,
     // apply hysteresis thresholding
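
For orientation while reading the renamed-buffer hunks below: the whole post-processing on rawDst is the standard SIFT normalization, shown here as plain scalar code. Bins are clipped at SIFT_DESCR_MAG_THR (0.2) times the descriptor's L2 norm, then rescaled by SIFT_INT_DESCR_FCTR (512) over the new norm so values land in uchar range. The constants are the ones defined in this source file; the helper name is made up:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    static void normalizeSiftScalar(float* v, int len)
    {
        const float SIFT_DESCR_MAG_THR = 0.2f, SIFT_INT_DESCR_FCTR = 512.f;
        float nrm2 = 0.f;
        for (int k = 0; k < len; k++) nrm2 += v[k] * v[k];
        const float thr = std::sqrt(nrm2) * SIFT_DESCR_MAG_THR;
        nrm2 = 0.f;
        for (int k = 0; k < len; k++)
        {
            float val = std::min(v[k], thr);   // hysteresis clip against overly large bins
            v[k] = val;
            nrm2 += val * val;
        }
        const float scale = SIFT_INT_DESCR_FCTR / std::max(std::sqrt(nrm2), FLT_EPSILON);
        for (int k = 0; k < len; k++) v[k] *= scale;
    }

    int main()
    {
        float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        normalizeSiftScalar(v, 8);
        return 0;
    }
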
@@ -735,17 +744,17 @@ void calcSIFTDescriptor(
 #if CV_SIMD
     {
         v_float32 __nrm2 = vx_setzero_f32();
-        v_float32 __dst;
+        v_float32 __rawDst;
         for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
         {
-            __dst = vx_load(dst + k);
-            __nrm2 = v_fma(__dst, __dst, __nrm2);
+            __rawDst = vx_load_aligned(rawDst + k);
+            __nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
         }
         nrm2 = (float)v_reduce_sum(__nrm2);
     }
 #endif
     for( ; k < len; k++ )
-        nrm2 += dst[k]*dst[k];
+        nrm2 += rawDst[k]*rawDst[k];

     float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR;
@@ -760,9 +769,9 @@ void calcSIFTDescriptor(
         __m256 __thr = _mm256_set1_ps(thr);
         for( ; i <= len - 8; i += 8 )
         {
-            __dst = _mm256_loadu_ps(&dst[i]);
+            __dst = _mm256_loadu_ps(&rawDst[i]);
             __dst = _mm256_min_ps(__dst, __thr);
-            _mm256_storeu_ps(&dst[i], __dst);
+            _mm256_storeu_ps(&rawDst[i], __dst);
 #if CV_FMA3
             __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
 #else
@@ -776,44 +785,78 @@ void calcSIFTDescriptor(
 #endif
     for( ; i < len; i++ )
     {
-        float val = std::min(dst[i], thr);
-        dst[i] = val;
+        float val = std::min(rawDst[i], thr);
+        rawDst[i] = val;
         nrm2 += val*val;
     }
     nrm2 = SIFT_INT_DESCR_FCTR/std::max(std::sqrt(nrm2), FLT_EPSILON);
 #if 1
     k = 0;
+    if( dstMat.type() == CV_32F )
+    {
+        float* dst = dstMat.ptr<float>(row);
 #if CV_SIMD
-    v_float32 __dst;
-    v_float32 __min = vx_setzero_f32();
-    v_float32 __max = vx_setall_f32(255.0f); // max of uchar
-    v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
-    {
+        v_float32 __dst;
+        v_float32 __min = vx_setzero_f32();
+        v_float32 __max = vx_setall_f32(255.0f); // max of uchar
+        v_float32 __nrm2 = vx_setall_f32(nrm2);
+        for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+        {
-        __dst = vx_load(dst + k);
-        __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
-        v_store(dst + k, __dst);
-    }
+            __dst = vx_load_aligned(rawDst + k);
+            __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+            v_store(dst + k, __dst);
+        }
 #endif
         for( ; k < len; k++ )
         {
-        dst[k] = saturate_cast<uchar>(dst[k]*nrm2);
+            dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
         }
+    }
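
One subtlety in the CV_32F branch above: even the float output is quantized to the uchar grid, i.e. each stored value equals (float)saturate_cast<uchar>(rawDst[k]*nrm2). The SIMD line does it as round, clamp to [0, 255], convert back to float. A small standalone check of that equivalence, with illustrative values:

    #include <opencv2/core.hpp>
    #include <algorithm>

    int main()
    {
        float v = 0.73f, nrm2 = 512.f;
        float scalar = (float)cv::saturate_cast<uchar>(v * nrm2);
        float vector_equiv = std::min(std::max((float)cvRound(v * nrm2), 0.f), 255.f);
        CV_Assert(scalar == vector_equiv);   // both give 255 here (373.76 rounds, then clamps)
        return 0;
    }
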
+    else // CV_8U
+    {
+        uint8_t* dst = dstMat.ptr<uint8_t>(row);
+#if CV_SIMD
+        v_float32 __dst0, __dst1;
+        v_uint16 __pack01;
+        v_float32 __nrm2 = vx_setall_f32(nrm2);
+        for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
+        {
+            __dst0 = vx_load_aligned(rawDst + k);
+            __dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
+            __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
+            v_pack_store(dst + k, __pack01);
+        }
+#endif
+        for( ; k < len; k++ )
+        {
+            dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
+        }
+    }
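
The new CV_8U branch converts straight to bytes with the universal pack intrinsics: the loop consumes two float vectors per iteration because v_pack_u narrows two v_int32 into one v_uint16, and v_pack_store then narrows that to uint8 on store; both saturate, matching saturate_cast<uchar> in the scalar tail. A standalone sketch:

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    int main()
    {
    #if CV_SIMD
        CV_DECL_ALIGNED(CV_SIMD_WIDTH) float src[v_float32::nlanes * 2];
        for (int i = 0; i < v_float32::nlanes * 2; i++) src[i] = 100.f * i;  // larger values saturate to 255
        uchar out[v_float32::nlanes * 2];
        v_float32 a = vx_load_aligned(src);
        v_float32 b = vx_load_aligned(src + v_float32::nlanes);
        v_uint16 p = v_pack_u(v_round(a), v_round(b));  // two int32 vectors -> uint16, unsigned-saturating
        v_pack_store(out, p);                           // uint16 -> uint8, saturating
        (void)out;
    #endif
        return 0;
    }
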
 #else
+    float* dst = dstMat.ptr<float>(row);
     float nrm1 = 0;
     for( k = 0; k < len; k++ )
     {
-        dst[k] *= nrm2;
-        nrm1 += dst[k];
+        rawDst[k] *= nrm2;
+        nrm1 += rawDst[k];
     }
     nrm1 = 1.f/std::max(nrm1, FLT_EPSILON);
+    if( dstMat.type() == CV_32F )
+    {
         for( k = 0; k < len; k++ )
         {
-        dst[k] = std::sqrt(dst[k] * nrm1); //saturate_cast<uchar>(std::sqrt(dst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
+            dst[k] = std::sqrt(rawDst[k] * nrm1);
         }
+    }
+    else // CV_8U
+    {
+        for( k = 0; k < len; k++ )
+        {
+            dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
+        }
+    }
 #endif
 }
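
The #else path (compiled out by the #if 1 above) keeps the alternative normalization in the source: scale by nrm2, L1-normalize, then take the square root of each bin, which resembles the "RootSIFT" scheme; the patch only re-points it at rawDst and adds a CV_8U variant. A scalar restatement of that disabled path, with a hypothetical helper name:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    static void rootSiftLikeScalar(float* v, int len, float nrm2)
    {
        float nrm1 = 0.f;
        for (int k = 0; k < len; k++)
        {
            v[k] *= nrm2;       // same nrm2 factor computed before the branch
            nrm1 += v[k];
        }
        nrm1 = 1.f / std::max(nrm1, FLT_EPSILON);
        for (int k = 0; k < len; k++)
            v[k] = std::sqrt(v[k] * nrm1);   // the CV_8U variant also scales by 512 and saturates
    }

    int main()
    {
        float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        rootSiftLikeScalar(v, 8, 0.5f);
        return 0;
    }
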