@@ -38,8 +38,10 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/

 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
 namespace cv
 {
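For context: the added header provides OpenCV's universal intrinsics (v_uint16x8, v_int32x4, v_load_expand, v_reduce_sum, ...), and the hunks below substitute that API for the raw SSE2/NEON intrinsics. A minimal sketch of the idiom, assuming a CV_SIMD128-enabled build (hypothetical helper, not part of the patch):

    // sum 8 ushort lanes via the universal API
    static unsigned sum8_u16(const ushort* p)
    {
        cv::v_uint16x8 a = cv::v_load(p);  // load 8 x u16
        cv::v_uint32x4 lo, hi;
        cv::v_expand(a, lo, hi);           // widen to u32 to avoid overflow
        return cv::v_reduce_sum(lo + hi);  // horizontal add
    }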
@@ -211,7 +213,7 @@ struct MomentsInTile_SIMD
     }
 };

-#if CV_SSE2
+#if CV_SIMD128

 template <>
 struct MomentsInTile_SIMD<uchar, int, int>
@@ -226,115 +228,33 @@ struct MomentsInTile_SIMD<uchar, int, int>
         int x = 0;

         {
-            __m128i dx = _mm_set1_epi16(8);
-            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+            v_int16x8 dx = v_setall_s16(8), qx = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7);
+            v_uint32x4 z = v_setzero_u32(), qx0 = z, qx1 = z, qx2 = z, qx3 = z;

             for( ; x <= len - 8; x += 8 )
             {
-                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
-                __m128i sx = _mm_mullo_epi16(qx, qx);
+                v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
+                v_int16x8 sx = v_mul_wrap(qx, qx);

-                qx0 = _mm_add_epi16(qx0, p);
-                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
-                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
-                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(_mm_mullo_epi16(p, qx), sx));
+                qx0 += v_reinterpret_as_u32(p);
+                qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
+                qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
+                qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));

-                qx = _mm_add_epi16(qx, dx);
+                qx += dx;
             }

-            __m128i qx01_lo = _mm_unpacklo_epi32(qx0, qx1);
-            __m128i qx23_lo = _mm_unpacklo_epi32(qx2, qx3);
-            __m128i qx01_hi = _mm_unpackhi_epi32(qx0, qx1);
-            __m128i qx23_hi = _mm_unpackhi_epi32(qx2, qx3);
-            qx01_lo = _mm_add_epi32(qx01_lo, qx01_hi);
-            qx23_lo = _mm_add_epi32(qx23_lo, qx23_hi);
-            __m128i qx0123_lo = _mm_unpacklo_epi64(qx01_lo, qx23_lo);
-            __m128i qx0123_hi = _mm_unpackhi_epi64(qx01_lo, qx23_lo);
-            qx0123_lo = _mm_add_epi32(qx0123_lo, qx0123_hi);
-            _mm_store_si128((__m128i*)buf, qx0123_lo);
-            x0 = (buf[0] & 0xffff) + (buf[0] >> 16);
-            x1 = buf[1];
-            x2 = buf[2];
-            x3 = buf[3];
+            x0 = v_reduce_sum(qx0);
+            x0 = (x0 & 0xffff) + (x0 >> 16);
+            x1 = v_reduce_sum(qx1);
+            x2 = v_reduce_sum(qx2);
+            x3 = v_reduce_sum(qx3);
        }

         return x;
     }
-
-    int CV_DECL_ALIGNED(16) buf[4];
 };

-#elif CV_NEON
-
-template <>
-struct MomentsInTile_SIMD<uchar, int, int>
-{
-    MomentsInTile_SIMD()
-    {
-        ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 };
-        qx_init = vld1_u16(init);
-        v_step = vdup_n_u16(4);
-    }
-
-    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
-    {
-        int x = 0;
-
-        uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z,
-            v_x2 = v_z, v_x3 = v_z;
-        uint16x4_t qx = qx_init;
-
-        for( ; x <= len - 8; x += 8 )
-        {
-            uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x));
-
-            // first part
-            uint32x4_t v_qx = vmovl_u16(qx);
-            uint16x4_t v_p = vget_low_u16(v_src);
-            uint32x4_t v_px = vmull_u16(qx, v_p);
-
-            v_x0 = vaddw_u16(v_x0, v_p);
-            v_x1 = vaddq_u32(v_x1, v_px);
-
-            v_px = vmulq_u32(v_px, v_qx);
-            v_x2 = vaddq_u32(v_x2, v_px);
-            v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
-            qx = vadd_u16(qx, v_step);
-
-            // second part
-            v_qx = vmovl_u16(qx);
-            v_p = vget_high_u16(v_src);
-            v_px = vmull_u16(qx, v_p);
-
-            v_x0 = vaddw_u16(v_x0, v_p);
-            v_x1 = vaddq_u32(v_x1, v_px);
-
-            v_px = vmulq_u32(v_px, v_qx);
-            v_x2 = vaddq_u32(v_x2, v_px);
-            v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
-
-            qx = vadd_u16(qx, v_step);
-        }
-
-        vst1q_u32(buf, v_x0);
-        x0 = buf[0] + buf[1] + buf[2] + buf[3];
-        vst1q_u32(buf, v_x1);
-        x1 = buf[0] + buf[1] + buf[2] + buf[3];
-        vst1q_u32(buf, v_x2);
-        x2 = buf[0] + buf[1] + buf[2] + buf[3];
-        vst1q_u32(buf, v_x3);
-        x3 = buf[0] + buf[1] + buf[2] + buf[3];
-
-        return x;
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    uint16x4_t qx_init, v_step;
-};
-
-#endif
-
-#if CV_SSE4_1
-
 template <>
 struct MomentsInTile_SIMD<ushort, int, int64>
 {
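What the rewritten uchar specialization above computes is unchanged: per-row accumulation of p, p*x, p*x^2 and p*x^3 over 8-pixel blocks, with the v_dotprod calls folding the 16-bit products into 32-bit lanes. A scalar sketch of the same contract (illustrative, not part of the patch):

    // scalar reference for MomentsInTile_SIMD<uchar, int, int>::operator()
    static int momentsRowRef(const uchar* ptr, int len,
                             int& x0, int& x1, int& x2, int& x3)
    {
        x0 = x1 = x2 = x3 = 0;
        for( int x = 0; x < len; x++ )
        {
            int p = ptr[x];     // pixel value
            int xp = x * p;     // p * x
            int xxp = x * xp;   // p * x^2
            x0 += p; x1 += xp; x2 += xxp; x3 += x * xxp;
        }
        return len; // the SIMD path returns the count it consumed (a multiple of 8)
    }

The (x0 & 0xffff) + (x0 >> 16) fix-up survives from the SSE2 version because qx0 holds two packed 16-bit partial sums in each 32-bit lane.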
@@ -348,49 +268,39 @@ struct MomentsInTile_SIMD<ushort, int, int64>
         int x = 0;

         {
-            __m128i v_delta = _mm_set1_epi32(4), v_zero = _mm_setzero_si128(), v_x0 = v_zero,
-                v_x1 = v_zero, v_x2 = v_zero, v_x3 = v_zero, v_ix0 = _mm_setr_epi32(0, 1, 2, 3);
+            v_int32x4 v_delta = v_setall_s32(4), v_ix0 = v_int32x4(0, 1, 2, 3);
+            v_uint32x4 z = v_setzero_u32(), v_x0 = z, v_x1 = z, v_x2 = z;
+            v_uint64x2 v_x3 = v_reinterpret_as_u64(z);

             for( ; x <= len - 4; x += 4 )
             {
-                __m128i v_src = _mm_loadl_epi64((const __m128i*)(ptr + x));
-                v_src = _mm_unpacklo_epi16(v_src, v_zero);
+                v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));

-                v_x0 = _mm_add_epi32(v_x0, v_src);
-                v_x1 = _mm_add_epi32(v_x1, _mm_mullo_epi32(v_src, v_ix0));
+                v_x0 += v_reinterpret_as_u32(v_src);
+                v_x1 += v_reinterpret_as_u32(v_src * v_ix0);

-                __m128i v_ix1 = _mm_mullo_epi32(v_ix0, v_ix0);
-                v_x2 = _mm_add_epi32(v_x2, _mm_mullo_epi32(v_src, v_ix1));
+                v_int32x4 v_ix1 = v_ix0 * v_ix0;
+                v_x2 += v_reinterpret_as_u32(v_src * v_ix1);

-                v_ix1 = _mm_mullo_epi32(v_ix0, v_ix1);
-                v_src = _mm_mullo_epi32(v_src, v_ix1);
-                v_x3 = _mm_add_epi64(v_x3, _mm_add_epi64(_mm_unpacklo_epi32(v_src, v_zero), _mm_unpackhi_epi32(v_src, v_zero)));
+                v_ix1 = v_ix0 * v_ix1;
+                v_src = v_src * v_ix1;
+                v_uint64x2 v_lo, v_hi;
+                v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
+                v_x3 += v_lo + v_hi;

-                v_ix0 = _mm_add_epi32(v_ix0, v_delta);
+                v_ix0 += v_delta;
             }

-            __m128i v_x01_lo = _mm_unpacklo_epi32(v_x0, v_x1);
-            __m128i v_x22_lo = _mm_unpacklo_epi32(v_x2, v_x2);
-            __m128i v_x01_hi = _mm_unpackhi_epi32(v_x0, v_x1);
-            __m128i v_x22_hi = _mm_unpackhi_epi32(v_x2, v_x2);
-            v_x01_lo = _mm_add_epi32(v_x01_lo, v_x01_hi);
-            v_x22_lo = _mm_add_epi32(v_x22_lo, v_x22_hi);
-            __m128i v_x0122_lo = _mm_unpacklo_epi64(v_x01_lo, v_x22_lo);
-            __m128i v_x0122_hi = _mm_unpackhi_epi64(v_x01_lo, v_x22_lo);
-            v_x0122_lo = _mm_add_epi32(v_x0122_lo, v_x0122_hi);
-            _mm_store_si128((__m128i*)buf64, v_x3);
-            _mm_store_si128((__m128i*)buf, v_x0122_lo);
-            x0 = buf[0];
-            x1 = buf[1];
-            x2 = buf[2];
+            x0 = v_reduce_sum(v_x0);
+            x1 = v_reduce_sum(v_x1);
+            x2 = v_reduce_sum(v_x2);
+            v_store_aligned(buf64, v_reinterpret_as_s64(v_x3));
             x3 = buf64[0] + buf64[1];
         }

         return x;
     }

-    int CV_DECL_ALIGNED(16) buf[4];
     int64 CV_DECL_ALIGNED(16) buf64[2];
 };
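A note on the accumulator widths this specialization preserves: with 16-bit pixels the third moment outgrows 32 bits (e.g. p = 65535 at x = 31 gives p*x^3 of roughly 1.95e9 per pixel, so summing just a couple of such pixels overflows an int), which is why v_x3 alone is widened to v_uint64x2 via v_expand before accumulation. A scalar sketch under the same assumption (illustrative, not part of the patch):

    // scalar reference for MomentsInTile_SIMD<ushort, int, int64>::operator()
    static int momentsRowRefU16(const ushort* ptr, int len,
                                int& x0, int& x1, int& x2, int64& x3)
    {
        x0 = x1 = x2 = 0;
        x3 = 0;
        for( int x = 0; x < len; x++ )
        {
            int p = ptr[x];
            x0 += p;
            x1 += x * p;
            x2 += x * x * p;               // still fits 32 bits for small tiles
            x3 += (int64)(x * x * p) * x;  // widen before the final multiply
        }
        return len;
    }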