@@ -395,7 +395,7 @@ struct AccW_SIMD<double, double>
return x ;
}
} ;
# elif CV_SSE2
# elif CV_SIMD128
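// Acc_SIMD<float, float>: plain accumulation, dst[i] += src[i], processing 8 floats per iteration as two v_float32x4 vectors.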
template < >
struct Acc_SIMD < float , float >
{
@@ -408,8 +408,8 @@ struct Acc_SIMD<float, float>
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
_mm_storeu_ps ( dst + x , _mm_add_ps ( _mm_loadu_ps ( dst + x ) , _mm_loadu_ps ( src + x ) ) ) ;
_mm_storeu_ps ( dst + x + 4 , _mm_add_ps ( _mm_loadu_ps ( dst + x + 4 ) , _mm_loadu_ps ( src + x + 4 ) ) ) ;
v_store ( dst + x , v_load ( dst + x ) + v_load ( src + x ) ) ;
v_store ( dst + x + 4 , v_load ( dst + x + 4 ) + v_load ( src + x + 4 ) ) ;
}
}
@@ -417,6 +417,7 @@ struct Acc_SIMD<float, float>
}
} ;
# if CV_SIMD128_64F
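// Acc_SIMD<float, double>: each v_float32x4 is widened to two v_float64x2 halves (v_cvt_f64 / v_cvt_f64_high) before being added to the double accumulator.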
template < >
struct Acc_SIMD < float , double >
{
@@ -429,17 +430,12 @@ struct Acc_SIMD<float, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128 v_src = _mm_loadu_ps ( src + x ) ;
__m128d v_src0 = _mm_cvtps_pd ( v_src ) ;
__m128d v_src1 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_src , v_src , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ) ;
v_float32x4 v_src = v_load ( src + x ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_src ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_src ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x , v_load ( dst + x ) + v_src0 ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + v_src1 ) ;
}
}
return x ;
@@ -458,21 +454,17 @@ struct Acc_SIMD<double, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128d v_src0 = _mm_loadu_pd ( src + x ) ;
__m128d v_src1 = _mm_loadu_pd ( src + x + 2 ) ;
v_float64x2 v_src0 = v_load ( src + x ) ;
v_float64x2 v_src1 = v_load ( src + x + 2 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x , v_load ( dst + x ) + v_src0 ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + v_src1 ) ;
}
}
return x ;
}
} ;
# endif //CV_SIMD128_64F
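// AccSqr_SIMD: accumulation of squares, dst[i] += src[i] * src[i].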
template < >
struct AccSqr_SIMD < float , float >
@@ -486,12 +478,13 @@ struct AccSqr_SIMD<float, float>
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128 v_src0 = _mm_loadu_ps ( src + x ) ;
__m128 v_src1 = _mm_loadu_ps ( src + x + 4 ) ;
v_src0 = _mm_mul_ps ( v_src0 , v_src0 ) ;
v_src1 = _mm_mul_ps ( v_src1 , v_src1 ) ;
_mm_storeu_ps ( dst + x , _mm_add_ps ( _mm_loadu_ps ( dst + x ) , v_src0 ) ) ;
_mm_storeu_ps ( dst + x + 4 , _mm_add_ps ( _mm_loadu_ps ( dst + x + 4 ) , v_src1 ) ) ;
v_float32x4 v_src0 = v_load ( src + x ) ;
v_float32x4 v_src1 = v_load ( src + x + 4 ) ;
v_src0 = v_src0 * v_src0 ;
v_src1 = v_src1 * v_src1 ;
v_store ( dst + x , v_load ( dst + x ) + v_src0 ) ;
v_store ( dst + x + 4 , v_load ( dst + x + 4 ) + v_src1 ) ;
}
}
@@ -499,6 +492,7 @@ struct AccSqr_SIMD<float, float>
}
} ;
# if CV_SIMD128_64F
template < >
struct AccSqr_SIMD < float , double >
{
@@ -511,19 +505,14 @@ struct AccSqr_SIMD<float, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128 v_src = _mm_loadu_ps ( src + x ) ;
__m128d v_src0 = _mm_cvtps_pd ( v_src ) ;
__m128d v_src1 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_src , v_src , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ) ;
v_src0 = _mm_mul_pd ( v_src0 , v_src0 ) ;
v_src1 = _mm_mul_pd ( v_src1 , v_src1 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_float32x4 v_src = v_load ( src + x ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_src ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_src ) ;
v_src0 = v_src0 * v_src0 ;
v_src1 = v_src1 * v_src1 ;
v_store ( dst + x , v_load ( dst + x ) + v_src0 ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + v_src1 ) ;
}
}
return x ;
@@ -542,23 +531,19 @@ struct AccSqr_SIMD<double, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128d v_src0 = _mm_loadu_pd ( src + x ) ;
__m128d v_src1 = _mm_loadu_pd ( src + x + 2 ) ;
v_src0 = _mm_mul_pd ( v_src0 , v_src0 ) ;
v_src1 = _mm_mul_pd ( v_src1 , v_src1 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_float64x2 v_src0 = v_load ( src + x ) ;
v_float64x2 v_src1 = v_load ( src + x + 2 ) ;
v_src0 = v_src0 * v_src0 ;
v_src1 = v_src1 * v_src1 ;
v_store ( dst + x , v_load ( dst + x ) + v_src0 ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + v_src1 ) ;
}
}
return x ;
}
} ;
# endif //CV_SIMD128_64F
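// AccProd_SIMD: accumulation of per-element products, dst[i] += src1[i] * src2[i].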
template < >
struct AccProd_SIMD < float , float >
@@ -572,8 +557,8 @@ struct AccProd_SIMD<float, float>
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
_mm_storeu_ps ( dst + x , _mm_add_ps ( _mm_loadu_ps ( dst + x ) , _mm_mul_ps ( _mm_loadu_ps ( src1 + x ) , _mm_loadu_ps ( src2 + x ) ) ) ) ;
_mm_storeu_ps ( dst + x + 4 , _mm_add_ps ( _mm_loadu_ps ( dst + x + 4 ) , _mm_mul_ps ( _mm_loadu_ps ( src1 + x + 4 ) , _mm_loadu_ps ( src2 + x + 4 ) ) ) ) ;
v_store ( dst + x , v_load ( dst + x ) + v_load ( src1 + x ) * v_load ( src2 + x ) ) ;
v_store ( dst + x + 4 , v_load ( dst + x + 4 ) + v_load ( src1 + x + 4 ) * v_load ( src2 + x + 4 ) ) ;
}
}
@@ -581,6 +566,7 @@ struct AccProd_SIMD<float, float>
}
} ;
# if CV_SIMD128_64F
template < >
struct AccProd_SIMD < float , double >
{
@@ -593,22 +579,16 @@ struct AccProd_SIMD<float, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128 v_1src = _mm_loadu_ps ( src1 + x ) ;
__m128 v_2src = _mm_loadu_ps ( src2 + x ) ;
__m128d v_1src0 = _mm_cvtps_pd ( v_1src ) ;
__m128d v_1src1 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_1src , v_1src , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_2src0 = _mm_cvtps_pd ( v_2src ) ;
__m128d v_2src1 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_2src , v_2src , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
v_float32x4 v_1src = v_load ( src1 + x ) ;
v_float32x4 v_2src = v_load ( src2 + x ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_float64x2 v_1src0 = v_cvt_f64 ( v_1src ) ;
v_float64x2 v_1src1 = v_cvt_f64_high ( v_1src ) ;
v_float64x2 v_2src0 = v_cvt_f64 ( v_2src ) ;
v_float64x2 v_2src1 = v_cvt_f64_high ( v_2src ) ;
v_dst0 = _mm_add_pd ( v_dst0 , _mm_mul_pd ( v_1src0 , v_2src0 ) ) ;
v_dst1 = _mm_add_pd ( v_dst1 , _mm_mul_pd ( v_1src1 , v_2src1 ) ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x , v_load ( dst + x ) + ( v_1src0 * v_2src0 ) ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + ( v_1src1 * v_2src1 ) ) ;
}
}
return x ;
@@ -627,25 +607,19 @@ struct AccProd_SIMD<double, double>
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128d v_src00 = _mm_loadu_pd ( src1 + x ) ;
__m128d v_src01 = _mm_loadu_pd ( src1 + x + 2 ) ;
__m128d v_src10 = _mm_loadu_pd ( src2 + x ) ;
__m128d v_src11 = _mm_loadu_pd ( src2 + x + 2 ) ;
__m128d v_src0 = _mm_mul_pd ( v_src00 , v_src10 ) ;
__m128d v_src1 = _mm_mul_pd ( v_src01 , v_src11 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
v_float64x2 v_src00 = v_load ( src1 + x ) ;
v_float64x2 v_src01 = v_load ( src1 + x + 2 ) ;
v_float64x2 v_src10 = v_load ( src2 + x ) ;
v_float64x2 v_src11 = v_load ( src2 + x + 2 ) ;
v_store ( dst + x , v_load ( dst + x ) + ( v_src00 * v_src10 ) ) ;
v_store ( dst + x + 2 , v_load ( dst + x + 2 ) + ( v_src01 * v_src11 ) ) ;
}
}
return x ;
}
} ;
# endif //CV_SIMD128_64F
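// AccW_SIMD: running weighted average, dst[i] = dst[i] * (1 - alpha) + src[i] * alpha.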
template < >
struct AccW_SIMD < float , float >
@@ -653,16 +627,16 @@ struct AccW_SIMD<float, float>
int operator ( ) ( const float * src , float * dst , const uchar * mask , int len , int cn , float alpha ) const
{
int x = 0 ;
__m128 v_alpha = _mm_set1_ps ( alpha ) ;
__m128 v_beta = _mm_set1_ps ( 1.0f - alpha ) ;
v_float32x4 v_alpha = v_setall_f32 ( alpha ) ;
v_float32x4 v_beta = v_setall_f32 ( 1.0f - alpha ) ;
if ( ! mask )
{
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
_mm_storeu_ps ( dst + x , _mm_add_ps ( _mm_mul_ps ( _mm_loadu_ps ( dst + x ) , v_beta ) , _mm_mul_ps ( _mm_loadu_ps ( src + x ) , v_alpha ) ) ) ;
_mm_storeu_ps ( dst + x + 4 , _mm_add_ps ( _mm_mul_ps ( _mm_loadu_ps ( dst + x + 4 ) , v_beta ) , _mm_mul_ps ( _mm_loadu_ps ( src + x + 4 ) , v_alpha ) ) ) ;
v_store ( dst + x , ( ( v_load ( dst + x ) * v_beta ) + ( v_load ( src + x ) * v_alpha ) ) ) ;
v_store ( dst + x + 4 , ( ( v_load ( dst + x + 4 ) * v_beta ) + ( v_load ( src + x + 4 ) * v_alpha ) ) ) ;
}
}
@@ -670,31 +644,32 @@ struct AccW_SIMD<float, float>
}
} ;
# if CV_SIMD128_64F
template < >
struct AccW_SIMD < float , double >
{
int operator ( ) ( const float * src , double * dst , const uchar * mask , int len , int cn , double alpha ) const
{
int x = 0 ;
__m128d v_alpha = _mm_set1_pd ( alpha ) ;
__m128d v_beta = _mm_set1_pd ( 1.0f - alpha ) ;
v_float64x2 v_alpha = v_setall_f64 ( alpha ) ;
v_float64x2 v_beta = v_setall_f64 ( 1.0f - alpha ) ;
if ( ! mask )
{
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128 v_src0 = _mm_loadu_ps ( src + x ) ;
__m128 v_src1 = _mm_loadu_ps ( src + x + 4 ) ;
__m128d v_src00 = _mm_cvtps_pd ( v_src0 ) ;
__m128d v_src01 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_src0 , v_src0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src10 = _mm_cvtps_pd ( v_src1 ) ;
__m128d v_src11 = _mm_cvtps_pd ( _mm_shuffle_ps ( v_src1 , v_src1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
_mm_storeu_pd ( dst + x , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x ) , v_beta ) , _mm_mul_pd ( v_src00 , v_alpha ) ) ) ;
_mm_storeu_pd ( dst + x + 2 , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x + 2 ) , v_beta ) , _mm_mul_pd ( v_src01 , v_alpha ) ) ) ;
_mm_storeu_pd ( dst + x + 4 , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x + 4 ) , v_beta ) , _mm_mul_pd ( v_src10 , v_alpha ) ) ) ;
_mm_storeu_pd ( dst + x + 6 , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x + 6 ) , v_beta ) , _mm_mul_pd ( v_src11 , v_alpha ) ) ) ;
v_float32x4 v_src0 = v_load ( src + x ) ;
v_float32x4 v_src1 = v_load ( src + x + 4 ) ;
v_float64x2 v_src00 = v_cvt_f64 ( v_src0 ) ;
v_float64x2 v_src01 = v_cvt_f64_high ( v_src0 ) ;
v_float64x2 v_src10 = v_cvt_f64 ( v_src1 ) ;
v_float64x2 v_src11 = v_cvt_f64_high ( v_src1 ) ;
v_store ( dst + x , ( ( v_load ( dst + x ) * v_beta ) + ( v_src00 * v_alpha ) ) ) ;
v_store ( dst + x + 2 , ( ( v_load ( dst + x + 2 ) * v_beta ) + ( v_src01 * v_alpha ) ) ) ;
v_store ( dst + x + 4 , ( ( v_load ( dst + x + 4 ) * v_beta ) + ( v_src10 * v_alpha ) ) ) ;
v_store ( dst + x + 6 , ( ( v_load ( dst + x + 6 ) * v_beta ) + ( v_src11 * v_alpha ) ) ) ;
}
}
@@ -708,26 +683,27 @@ struct AccW_SIMD<double, double>
int operator ( ) ( const double * src , double * dst , const uchar * mask , int len , int cn , double alpha ) const
{
int x = 0 ;
__m128d v_alpha = _mm_set1_pd ( alpha ) ;
__m128d v_beta = _mm_set1_pd ( 1.0f - alpha ) ;
v_float64x2 v_alpha = v_setall_f64 ( alpha ) ;
v_float64x2 v_beta = v_setall_f64 ( 1.0f - alpha ) ;
if ( ! mask )
{
len *= cn ;
for ( ; x <= len - 4 ; x += 4 )
{
__m128d v_src0 = _mm_loadu_pd ( src + x ) ;
__m128d v_src1 = _mm_loadu_pd ( src + x + 2 ) ;
v_float64x2 v_src0 = v_load ( src + x ) ;
v_float64x2 v_src1 = v_load ( src + x + 2 ) ;
_mm_storeu_pd ( dst + x , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x ) , v_beta ) , _mm_mul_pd ( v_src0 , v_alpha ) ) ) ;
_mm_storeu_pd ( dst + x + 2 , _mm_add_pd ( _mm_mul_pd ( _mm_loadu_pd ( dst + x + 2 ) , v_beta ) , _mm_mul_pd ( v_src1 , v_alpha ) ) ) ;
v_store ( dst + x , ( ( v_load ( dst + x ) * v_beta ) + ( v_src0 * v_alpha ) ) ) ;
v_store ( dst + x + 2 , ( ( v_load ( dst + x + 2 ) * v_beta ) + ( v_src1 * v_alpha ) ) ) ;
}
}
return x ;
}
} ;
# endif
# endif //CV_SIMD128_64F
# endif //CV_SIMD128
# if CV_SIMD128
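// Integer-source kernels: uchar/ushort inputs are widened with v_expand before conversion to the float or double accumulator type; the masked paths zero out lanes where mask == 0 (v_src & v_mask).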
template < >
@@ -742,7 +718,7 @@ struct Acc_SIMD<uchar, float>
len *= cn ;
for ( ; x <= len - 16 ; x += 16 )
{
v_uint8x16 v_src = v_load ( ( const uchar * ) ( src + x ) ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_uint16x8 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
@@ -762,9 +738,9 @@ struct Acc_SIMD<uchar, float>
for ( ; x <= len - 16 ; x += 16 )
{
v_uint8x16 v_mask = v_load ( ( const uchar * ) ( mask + x ) ) ;
v_uint8x16 v_mask = v_load ( mask + x ) ;
v_mask = ~ ( v_0 == v_mask ) ;
v_uint8x16 v_src = v_load ( ( const uchar * ) ( src + x ) ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_src = v_src & v_mask ;
v_uint16x8 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
@@ -795,7 +771,7 @@ struct Acc_SIMD<ushort, float>
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
v_uint16x8 v_src = v_load ( ( const ushort * ) ( src + x ) ) ;
v_uint16x8 v_src = v_load ( src + x ) ;
v_uint32x4 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
@@ -808,7 +784,7 @@ struct Acc_SIMD<ushort, float>
}
} ;
# if CV_SSE2
# if CV_SIMD128_64F
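// Double-precision variants for integer sources: the widened v_uint32x4 halves are reinterpreted as signed (v_reinterpret_as_s32) and converted with v_cvt_f64 / v_cvt_f64_high.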
template < >
struct Acc_SIMD < uchar , double >
{
@@ -818,52 +794,52 @@ struct Acc_SIMD<uchar, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 16 ; x += 16 )
{
__m128i v_src = _mm_loadu_si128 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int0 = _mm_unpacklo_epi8 ( v_src , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi8 ( v_src , v_0 ) ;
__m128i v_int00 = _mm_unpacklo_epi16 ( v_int0 , v_0 ) ;
__m128i v_int01 = _mm_unpackhi_epi16 ( v_int0 , v_0 ) ;
__m128i v_int10 = _mm_unpacklo_epi16 ( v_int1 , v_0 ) ;
__m128i v_int11 = _mm_unpackhi_epi16 ( v_int1 , v_0 ) ;
__m128d v_src0 = _mm_cvtepi32_pd ( v_int00 ) ;
__m128d v_src1 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int00 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src2 = _mm_cvtepi32_pd ( v_int01 ) ;
__m128d v_src3 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int01 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src4 = _mm_cvtepi32_pd ( v_int10 ) ;
__m128d v_src5 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int10 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src6 = _mm_cvtepi32_pd ( v_int11 ) ;
__m128d v_src7 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int11 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
__m128d v_dst4 = _mm_loadu_pd ( dst + x + 8 ) ;
__m128d v_dst5 = _mm_loadu_pd ( dst + x + 10 ) ;
__m128d v_dst6 = _mm_loadu_pd ( dst + x + 12 ) ;
__m128d v_dst7 = _mm_loadu_pd ( dst + x + 14 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
v_dst4 = _mm_add_pd ( v_dst4 , v_src4 ) ;
v_dst5 = _mm_add_pd ( v_dst5 , v_src5 ) ;
v_dst6 = _mm_add_pd ( v_dst6 , v_src6 ) ;
v_dst7 = _mm_add_pd ( v_dst7 , v_src7 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
_mm_storeu_pd ( dst + x + 8 , v_dst4 ) ;
_mm_storeu_pd ( dst + x + 10 , v_dst5 ) ;
_mm_storeu_pd ( dst + x + 12 , v_dst6 ) ;
_mm_storeu_pd ( dst + x + 14 , v_dst7 ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_uint16x8 v_int0 , v_int1 ;
v_expand ( v_src , v_int0 , v_int1 ) ;
v_uint32x4 v_int00 , v_int01 , v_int10 , v_int11 ;
v_expand ( v_int0 , v_int00 , v_int01 ) ;
v_expand ( v_int1 , v_int10 , v_int11 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int00 ) ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int00 ) ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int01 ) ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int01 ) ) ;
v_float64x2 v_src4 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int10 ) ) ;
v_float64x2 v_src5 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int10 ) ) ;
v_float64x2 v_src6 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int11 ) ) ;
v_float64x2 v_src7 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int11 ) ) ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_float64x2 v_dst4 = v_load ( dst + x + 8 ) ;
v_float64x2 v_dst5 = v_load ( dst + x + 10 ) ;
v_float64x2 v_dst6 = v_load ( dst + x + 12 ) ;
v_float64x2 v_dst7 = v_load ( dst + x + 14 ) ;
v_dst0 = v_dst0 + v_src0 ;
v_dst1 = v_dst1 + v_src1 ;
v_dst2 = v_dst2 + v_src2 ;
v_dst3 = v_dst3 + v_src3 ;
v_dst4 = v_dst4 + v_src4 ;
v_dst5 = v_dst5 + v_src5 ;
v_dst6 = v_dst6 + v_src6 ;
v_dst7 = v_dst7 + v_src7 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
v_store ( dst + x + 8 , v_dst4 ) ;
v_store ( dst + x + 10 , v_dst5 ) ;
v_store ( dst + x + 12 , v_dst6 ) ;
v_store ( dst + x + 14 , v_dst7 ) ;
}
}
return x ;
@@ -879,32 +855,32 @@ struct Acc_SIMD<ushort, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_src = _mm_loadu_si128 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int0 = _mm_unpacklo_epi16 ( v_src , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi16 ( v_src , v_0 ) ;
__m128d v_src0 = _mm_cvtepi32_pd ( v_int0 ) ;
__m128d v_src1 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src2 = _mm_cvtepi32_pd ( v_int1 ) ;
__m128d v_src3 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint16x8 v_src = v_load ( src + x ) ;
v_uint32x4 v_int0 , v_int1 ;
v_expand ( v_src , v_int0 , v_int1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int0 ) ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int0 ) ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int1 ) ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int1 ) ) ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 = v_dst0 + v_src0 ;
v_dst1 = v_dst1 + v_src1 ;
v_dst2 = v_dst2 + v_src2 ;
v_dst3 = v_dst3 + v_src3 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
return x ;
@@ -924,7 +900,7 @@ struct AccSqr_SIMD<uchar, float>
len *= cn ;
for ( ; x <= len - 16 ; x += 16 )
{
v_uint8x16 v_src = v_load ( ( const uchar * ) ( src + x ) ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_uint16x8 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
v_src0 = v_src0 * v_src0 ;
@@ -945,9 +921,9 @@ struct AccSqr_SIMD<uchar, float>
v_uint8x16 v_0 = v_setall_u8 ( 0 ) ;
for ( ; x <= len - 16 ; x += 16 )
{
v_uint8x16 v_mask = v_load ( ( const uchar * ) ( mask + x ) ) ;
v_uint8x16 v_mask = v_load ( mask + x ) ;
v_mask = ~ ( v_0 == v_mask ) ;
v_uint8x16 v_src = v_load ( ( const uchar * ) ( src + x ) ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_src = v_src & v_mask ;
v_uint16x8 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
@@ -981,7 +957,7 @@ struct AccSqr_SIMD<ushort, float>
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
v_uint16x8 v_src = v_load ( ( const ushort * ) ( src + x ) ) ;
v_uint16x8 v_src = v_load ( src + x ) ;
v_uint32x4 v_src0 , v_src1 ;
v_expand ( v_src , v_src0 , v_src1 ) ;
@@ -1000,7 +976,7 @@ struct AccSqr_SIMD<ushort, float>
}
} ;
# if CV_SSE2
# if CV_SIMD128_64F
template < >
struct AccSqr_SIMD < uchar , double >
{
@@ -1010,37 +986,39 @@ struct AccSqr_SIMD<uchar, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_src = _mm_loadl_epi64 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int = _mm_unpacklo_epi8 ( v_src , v_0 ) ;
__m128i v_int0 = _mm_unpacklo_epi16 ( v_int , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi16 ( v_int , v_0 ) ;
__m128d v_src0 = _mm_cvtepi32_pd ( v_int0 ) ;
__m128d v_src1 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src2 = _mm_cvtepi32_pd ( v_int1 ) ;
__m128d v_src3 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
v_src0 = _mm_mul_pd ( v_src0 , v_src0 ) ;
v_src1 = _mm_mul_pd ( v_src1 , v_src1 ) ;
v_src2 = _mm_mul_pd ( v_src2 , v_src2 ) ;
v_src3 = _mm_mul_pd ( v_src3 , v_src3 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_uint16x8 v_int , dummy ;
v_expand ( v_src , v_int , dummy ) ;
v_uint32x4 v_int0 , v_int1 ;
v_expand ( v_int , v_int0 , v_int1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int0 ) ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int0 ) ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_reinterpret_as_s32 ( v_int1 ) ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_reinterpret_as_s32 ( v_int1 ) ) ;
v_src0 = v_src0 * v_src0 ;
v_src1 = v_src1 * v_src1 ;
v_src2 = v_src2 * v_src2 ;
v_src3 = v_src3 * v_src3 ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 += v_src0 ;
v_dst1 += v_src1 ;
v_dst2 += v_src2 ;
v_dst3 += v_src3 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
return x ;
@@ -1056,36 +1034,39 @@ struct AccSqr_SIMD<ushort, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_src = _mm_loadu_si128 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int0 = _mm_unpacklo_epi16 ( v_src , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi16 ( v_src , v_0 ) ;
__m128d v_src0 = _mm_cvtepi32_pd ( v_int0 ) ;
__m128d v_src1 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src2 = _mm_cvtepi32_pd ( v_int1 ) ;
__m128d v_src3 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
v_src0 = _mm_mul_pd ( v_src0 , v_src0 ) ;
v_src1 = _mm_mul_pd ( v_src1 , v_src1 ) ;
v_src2 = _mm_mul_pd ( v_src2 , v_src2 ) ;
v_src3 = _mm_mul_pd ( v_src3 , v_src3 ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint16x8 v_src = v_load ( src + x ) ;
v_uint32x4 v_int_0 , v_int_1 ;
v_expand ( v_src , v_int_0 , v_int_1 ) ;
v_int32x4 v_int0 = v_reinterpret_as_s32 ( v_int_0 ) ;
v_int32x4 v_int1 = v_reinterpret_as_s32 ( v_int_1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_int0 ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_int0 ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_int1 ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_int1 ) ;
v_src0 = v_src0 * v_src0 ;
v_src1 = v_src1 * v_src1 ;
v_src2 = v_src2 * v_src2 ;
v_src3 = v_src3 * v_src3 ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 += v_src0 ;
v_dst1 += v_src1 ;
v_dst2 += v_src2 ;
v_dst3 += v_src3 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
return x ;
@@ -1227,7 +1208,7 @@ struct AccProd_SIMD<ushort, float>
}
} ;
# if CV_SSE2
# if CV_SIMD128_64F
template < >
struct AccProd_SIMD < uchar , double >
{
@@ -1237,38 +1218,44 @@ struct AccProd_SIMD<uchar, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_1src = _mm_loadl_epi64 ( ( const __m128i * ) ( src1 + x ) ) ;
__m128i v_2src = _mm_loadl_epi64 ( ( const __m128i * ) ( src2 + x ) ) ;
__m128i v_1int = _mm_unpacklo_epi8 ( v_1src , v_0 ) ;
__m128i v_2int = _mm_unpacklo_epi8 ( v_2src , v_0 ) ;
__m128i v_1int0 = _mm_unpacklo_epi16 ( v_1int , v_0 ) ;
__m128i v_1int1 = _mm_unpackhi_epi16 ( v_1int , v_0 ) ;
__m128i v_2int0 = _mm_unpacklo_epi16 ( v_2int , v_0 ) ;
__m128i v_2int1 = _mm_unpackhi_epi16 ( v_2int , v_0 ) ;
__m128d v_src0 = _mm_mul_pd ( _mm_cvtepi32_pd ( v_1int0 ) , _mm_cvtepi32_pd ( v_2int0 ) ) ;
__m128d v_src1 = _mm_mul_pd ( _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_1int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) , _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_2int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ) ;
__m128d v_src2 = _mm_mul_pd ( _mm_cvtepi32_pd ( v_1int1 ) , _mm_cvtepi32_pd ( v_2int1 ) ) ;
__m128d v_src3 = _mm_mul_pd ( _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_1int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) , _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_2int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint8x16 v_1src = v_load ( src1 + x ) ;
v_uint8x16 v_2src = v_load ( src2 + x ) ;
v_uint16x8 v_1int , v_2int , dummy ;
v_expand ( v_1src , v_1int , dummy ) ;
v_expand ( v_2src , v_2int , dummy ) ;
v_uint32x4 v_1int_0 , v_1int_1 , v_2int_0 , v_2int_1 ;
v_expand ( v_1int , v_1int_0 , v_1int_1 ) ;
v_expand ( v_2int , v_2int_0 , v_2int_1 ) ;
v_int32x4 v_1int0 = v_reinterpret_as_s32 ( v_1int_0 ) ;
v_int32x4 v_1int1 = v_reinterpret_as_s32 ( v_1int_1 ) ;
v_int32x4 v_2int0 = v_reinterpret_as_s32 ( v_2int_0 ) ;
v_int32x4 v_2int1 = v_reinterpret_as_s32 ( v_2int_1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_1int0 ) * v_cvt_f64 ( v_2int0 ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_1int0 ) * v_cvt_f64_high ( v_2int0 ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_1int1 ) * v_cvt_f64 ( v_2int1 ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_1int1 ) * v_cvt_f64_high ( v_2int1 ) ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 += v_src0 ;
v_dst1 += v_src1 ;
v_dst2 += v_src2 ;
v_dst3 += v_src3 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
return x ;
@@ -1284,35 +1271,40 @@ struct AccProd_SIMD<ushort, double>
if ( ! mask )
{
__m128i v_0 = _mm_setzero_si128 ( ) ;
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_1src = _mm_loadu_si128 ( ( const __m128i * ) ( src1 + x ) ) ;
__m128i v_2src = _mm_loadu_si128 ( ( const __m128i * ) ( src2 + x ) ) ;
__m128i v_1int0 = _mm_unpacklo_epi16 ( v_1src , v_0 ) ;
__m128i v_1int1 = _mm_unpackhi_epi16 ( v_1src , v_0 ) ;
__m128i v_2int0 = _mm_unpacklo_epi16 ( v_2src , v_0 ) ;
__m128i v_2int1 = _mm_unpackhi_epi16 ( v_2src , v_0 ) ;
__m128d v_src0 = _mm_mul_pd ( _mm_cvtepi32_pd ( v_1int0 ) , _mm_cvtepi32_pd ( v_2int0 ) ) ;
__m128d v_src1 = _mm_mul_pd ( _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_1int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) , _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_2int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ) ;
__m128d v_src2 = _mm_mul_pd ( _mm_cvtepi32_pd ( v_1int1 ) , _mm_cvtepi32_pd ( v_2int1 ) ) ;
__m128d v_src3 = _mm_mul_pd ( _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_1int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) , _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_2int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( v_dst0 , v_src0 ) ;
v_dst1 = _mm_add_pd ( v_dst1 , v_src1 ) ;
v_dst2 = _mm_add_pd ( v_dst2 , v_src2 ) ;
v_dst3 = _mm_add_pd ( v_dst3 , v_src3 ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint16x8 v_1src = v_load ( src1 + x ) ;
v_uint16x8 v_2src = v_load ( src2 + x ) ;
v_uint32x4 v_1int_0 , v_1int_1 , v_2int_0 , v_2int_1 ;
v_expand ( v_1src , v_1int_0 , v_1int_1 ) ;
v_expand ( v_2src , v_2int_0 , v_2int_1 ) ;
v_int32x4 v_1int0 = v_reinterpret_as_s32 ( v_1int_0 ) ;
v_int32x4 v_1int1 = v_reinterpret_as_s32 ( v_1int_1 ) ;
v_int32x4 v_2int0 = v_reinterpret_as_s32 ( v_2int_0 ) ;
v_int32x4 v_2int1 = v_reinterpret_as_s32 ( v_2int_1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_1int0 ) * v_cvt_f64 ( v_2int0 ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_1int0 ) * v_cvt_f64_high ( v_2int0 ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_1int1 ) * v_cvt_f64 ( v_2int1 ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_1int1 ) * v_cvt_f64_high ( v_2int1 ) ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 = v_dst0 + v_src0 ;
v_dst1 = v_dst1 + v_src1 ;
v_dst2 = v_dst2 + v_src2 ;
v_dst3 = v_dst3 + v_src3 ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
return x ;
@@ -1399,45 +1391,50 @@ struct AccW_SIMD<ushort, float>
}
} ;
# if CV_SSE2
# if CV_SIMD128_64F
template < >
struct AccW_SIMD < uchar , double >
{
int operator ( ) ( const uchar * src , double * dst , const uchar * mask , int len , int cn , double alpha ) const
{
int x = 0 ;
__m128d v_alpha = _mm_set1_pd ( alpha ) ;
__m128d v_beta = _mm_set1_pd ( 1.0f - alpha ) ;
__m128i v_0 = _mm_setzero_si128 ( ) ;
v_float64x2 v_alpha = v_setall_f64 ( alpha ) ;
v_float64x2 v_beta = v_setall_f64 ( 1.0f - alpha ) ;
if ( ! mask )
{
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_src = _mm_loadl_epi64 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int = _mm_unpacklo_epi8 ( v_src , v_0 ) ;
__m128i v_int0 = _mm_unpacklo_epi16 ( v_int , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi16 ( v_int , v_0 ) ;
__m128d v_src0 = _mm_cvtepi32_pd ( v_int0 ) ;
__m128d v_src1 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src2 = _mm_cvtepi32_pd ( v_int1 ) ;
__m128d v_src3 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_dst0 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst1 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst2 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst3 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst0 = _mm_add_pd ( _mm_mul_pd ( v_dst0 , v_beta ) , _mm_mul_pd ( v_src0 , v_alpha ) ) ;
v_dst1 = _mm_add_pd ( _mm_mul_pd ( v_dst1 , v_beta ) , _mm_mul_pd ( v_src1 , v_alpha ) ) ;
v_dst2 = _mm_add_pd ( _mm_mul_pd ( v_dst2 , v_beta ) , _mm_mul_pd ( v_src2 , v_alpha ) ) ;
v_dst3 = _mm_add_pd ( _mm_mul_pd ( v_dst3 , v_beta ) , _mm_mul_pd ( v_src3 , v_alpha ) ) ;
_mm_storeu_pd ( dst + x , v_dst0 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst1 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst2 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst3 ) ;
v_uint8x16 v_src = v_load ( src + x ) ;
v_uint16x8 v_int , dummy ;
v_expand ( v_src , v_int , dummy ) ;
v_uint32x4 v_int_0 , v_int_1 ;
v_expand ( v_int , v_int_0 , v_int_1 ) ;
v_int32x4 v_int0 = v_reinterpret_as_s32 ( v_int_0 ) ;
v_int32x4 v_int1 = v_reinterpret_as_s32 ( v_int_1 ) ;
v_float64x2 v_src0 = v_cvt_f64 ( v_int0 ) ;
v_float64x2 v_src1 = v_cvt_f64_high ( v_int0 ) ;
v_float64x2 v_src2 = v_cvt_f64 ( v_int1 ) ;
v_float64x2 v_src3 = v_cvt_f64_high ( v_int1 ) ;
v_float64x2 v_dst0 = v_load ( dst + x ) ;
v_float64x2 v_dst1 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst2 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst3 = v_load ( dst + x + 6 ) ;
v_dst0 = ( v_dst0 * v_beta ) + ( v_src0 * v_alpha ) ;
v_dst1 = ( v_dst1 * v_beta ) + ( v_src1 * v_alpha ) ;
v_dst2 = ( v_dst2 * v_beta ) + ( v_src2 * v_alpha ) ;
v_dst3 = ( v_dst3 * v_beta ) + ( v_src3 * v_alpha ) ;
v_store ( dst + x , v_dst0 ) ;
v_store ( dst + x + 2 , v_dst1 ) ;
v_store ( dst + x + 4 , v_dst2 ) ;
v_store ( dst + x + 6 , v_dst3 ) ;
}
}
@@ -1451,44 +1448,47 @@ struct AccW_SIMD<ushort, double>
int operator ( ) ( const ushort * src , double * dst , const uchar * mask , int len , int cn , double alpha ) const
{
int x = 0 ;
__m128d v_alpha = _mm_set1_pd ( alpha ) ;
__m128d v_beta = _mm_set1_pd ( 1.0f - alpha ) ;
__m128i v_0 = _mm_setzero_si128 ( ) ;
v_float64x2 v_alpha = v_setall_f64 ( alpha ) ;
v_float64x2 v_beta = v_setall_f64 ( 1.0f - alpha ) ;
if ( ! mask )
{
len *= cn ;
for ( ; x <= len - 8 ; x += 8 )
{
__m128i v_src = _mm_loadu_si128 ( ( const __m128i * ) ( src + x ) ) ;
__m128i v_int0 = _mm_unpacklo_epi16 ( v_src , v_0 ) ;
__m128i v_int1 = _mm_unpackhi_epi16 ( v_src , v_0 ) ;
__m128d v_src00 = _mm_cvtepi32_pd ( v_int0 ) ;
__m128d v_src01 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int0 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_src10 = _mm_cvtepi32_pd ( v_int1 ) ;
__m128d v_src11 = _mm_cvtepi32_pd ( _mm_shuffle_epi32 ( v_int1 , _MM_SHUFFLE ( 0 , 0 , 3 , 2 ) ) ) ;
__m128d v_dst00 = _mm_loadu_pd ( dst + x ) ;
__m128d v_dst01 = _mm_loadu_pd ( dst + x + 2 ) ;
__m128d v_dst10 = _mm_loadu_pd ( dst + x + 4 ) ;
__m128d v_dst11 = _mm_loadu_pd ( dst + x + 6 ) ;
v_dst00 = _mm_add_pd ( _mm_mul_pd ( v_dst00 , v_beta ) , _mm_mul_pd ( v_src00 , v_alpha ) ) ;
v_dst01 = _mm_add_pd ( _mm_mul_pd ( v_dst01 , v_beta ) , _mm_mul_pd ( v_src01 , v_alpha ) ) ;
v_dst10 = _mm_add_pd ( _mm_mul_pd ( v_dst10 , v_beta ) , _mm_mul_pd ( v_src10 , v_alpha ) ) ;
v_dst11 = _mm_add_pd ( _mm_mul_pd ( v_dst11 , v_beta ) , _mm_mul_pd ( v_src11 , v_alpha ) ) ;
_mm_storeu_pd ( dst + x , v_dst00 ) ;
_mm_storeu_pd ( dst + x + 2 , v_dst01 ) ;
_mm_storeu_pd ( dst + x + 4 , v_dst10 ) ;
_mm_storeu_pd ( dst + x + 6 , v_dst11 ) ;
v_uint16x8 v_src = v_load ( src + x ) ;
v_uint32x4 v_int_0 , v_int_1 ;
v_expand ( v_src , v_int_0 , v_int_1 ) ;
v_int32x4 v_int0 = v_reinterpret_as_s32 ( v_int_0 ) ;
v_int32x4 v_int1 = v_reinterpret_as_s32 ( v_int_1 ) ;
v_float64x2 v_src00 = v_cvt_f64 ( v_int0 ) ;
v_float64x2 v_src01 = v_cvt_f64_high ( v_int0 ) ;
v_float64x2 v_src10 = v_cvt_f64 ( v_int1 ) ;
v_float64x2 v_src11 = v_cvt_f64_high ( v_int1 ) ;
v_float64x2 v_dst00 = v_load ( dst + x ) ;
v_float64x2 v_dst01 = v_load ( dst + x + 2 ) ;
v_float64x2 v_dst10 = v_load ( dst + x + 4 ) ;
v_float64x2 v_dst11 = v_load ( dst + x + 6 ) ;
v_dst00 = ( v_dst00 * v_beta ) + ( v_src00 * v_alpha ) ;
v_dst01 = ( v_dst01 * v_beta ) + ( v_src01 * v_alpha ) ;
v_dst10 = ( v_dst10 * v_beta ) + ( v_src10 * v_alpha ) ;
v_dst11 = ( v_dst11 * v_beta ) + ( v_src11 * v_alpha ) ;
v_store ( dst + x , v_dst00 ) ;
v_store ( dst + x + 2 , v_dst01 ) ;
v_store ( dst + x + 4 , v_dst10 ) ;
v_store ( dst + x + 6 , v_dst11 ) ;
}
}
return x ;
}
} ;
# endif //CV_SSE2
# endif //CV_SIMD128_64F
# endif //CV_SIMD128
template < typename T , typename AT > void