@@ -425,9 +425,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
                 v_uint16x8 v_0 = v_setall_u16(0);
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 _v_mask = v_load(mask + x);
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(_v_mask, v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src = v_load(src + x);
                     v_src = v_src & v_mask;
@@ -443,9 +441,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
                 v_uint16x8 v_0 = v_setall_u16(0);
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 _v_mask = v_load(mask + x);
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(_v_mask, v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -491,8 +487,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
+                    v_uint16x8 v_masku16 = v_load_expand(mask + x);
                     v_uint32x4 v_masku320, v_masku321;
                     v_expand(v_masku16, v_masku320, v_masku321);
                     v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -506,8 +501,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
+                    v_uint16x8 v_masku16 = v_load_expand(mask + x);
                     v_uint32x4 v_masku320, v_masku321;
                     v_expand(v_masku16, v_masku320, v_masku321);
                     v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -770,8 +764,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(v_load(mask + x), v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src = v_load(src + x);
                     v_src = v_src & v_mask;
@@ -803,8 +796,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_mask, dummy;
-                    v_expand(v_load(mask + x), v_mask, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_uint16x8 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -871,10 +863,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -892,10 +881,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -947,10 +933,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -967,10 +950,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_masku16, dummy0;
-                    v_expand(v_load(mask + x), v_masku16, dummy0);
-                    v_uint32x4 v_masku32, dummy1;
-                    v_expand(v_masku16, v_masku32, dummy1);
+                    v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
                     v_expand(v_masku32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -1157,9 +1137,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
                     v_uint32x4 v_mask0, v_mask1;
-                    v_expand(stub, v_mask0, v_mask1);
+                    v_expand(v_mask16, v_mask0, v_mask1);
                     v_mask0 = ~(v_mask0 == v_0);
                     v_mask1 = ~(v_mask1 == v_0);
                     v_uint16x8 v_src = v_load(src + x);
@@ -1182,9 +1162,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
                     v_uint32x4 v_mask0, v_mask1;
-                    v_expand(stub, v_mask0, v_mask1);
+                    v_expand(v_mask16, v_mask0, v_mask1);
                     v_mask0 = ~(v_mask0 == v_0);
                     v_mask1 = ~(v_mask1 == v_0);
@@ -1254,11 +1234,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_stub = v_load_expand(mask + x);
-                    v_uint32x4 v_stub0, v_stub1;
-                    v_expand(v_stub, v_stub0, v_stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
+                    v_uint32x4 v_mask_0, v_mask_1;
+                    v_expand(v_mask16, v_mask_0, v_mask_1);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
                     v_float32x4 v_src0 = v_load(src + x);
                     v_float32x4 v_src1 = v_load(src + x + 4);
                     v_src0 = v_src0 & v_mask0;
@@ -1274,11 +1254,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 v_stub = v_load_expand(mask + x);
-                    v_uint32x4 v_stub0, v_stub1;
-                    v_expand(v_stub, v_stub0, v_stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+                    v_uint16x8 v_mask16 = v_load_expand(mask + x);
+                    v_uint32x4 v_mask_0, v_mask_1;
+                    v_expand(v_mask16, v_mask_0, v_mask_1);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
                     v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                     v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
@@ -1319,9 +1299,7 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
             int size = len * cn;
             for ( ; x <= size - cVectorWidth; x += cVectorWidth)
             {
-                v_uint8x16 v_src = v_load(src + x);
-                v_uint16x8 v_int, dummy;
-                v_expand(v_src, v_int, dummy);
+                v_uint16x8 v_int = v_load_expand(src + x);
                 v_uint32x4 v_int0, v_int1;
                 v_expand(v_int, v_int0, v_int1);
@@ -1353,17 +1331,15 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
         }
         else
         {
-            v_uint8x16 v_0 = v_setzero_u8();
+            v_uint16x8 v_0 = v_setzero_u16();
             if (cn == 1)
            {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
-                    v_uint8x16 v_src = v_load(src + x);
-                    v_src = v_src & v_mask;
-                    v_uint16x8 v_int, dummy;
-                    v_expand(v_src, v_int, dummy);
+                    v_uint16x8 v_src = v_load_expand(src + x);
+                    v_uint16x8 v_int = v_src & v_mask;
                     v_uint32x4 v_int0, v_int1;
                     v_expand(v_int, v_int0, v_int1);
@@ -1395,19 +1371,19 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
             }
             else if (cn == 3)
             {
-                for ( ; x <= len - cVectorWidth; x += cVectorWidth)
+                for ( ; x <= len - /*cVectorWidth*/ 16; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
-                    v_mask = ~(v_mask == v_0);
                     v_uint8x16 v_src0, v_src1, v_src2;
                     v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                    v_src0 = v_src0 & v_mask;
-                    v_src1 = v_src1 & v_mask;
-                    v_src2 = v_src2 & v_mask;
                     v_uint16x8 v_int0, v_int1, v_int2, dummy;
                     v_expand(v_src0, v_int0, dummy);
                     v_expand(v_src1, v_int1, dummy);
                     v_expand(v_src2, v_int2, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
+                    v_mask = ~(v_mask == v_0);
+                    v_int0 = v_int0 & v_mask;
+                    v_int1 = v_int1 & v_mask;
+                    v_int2 = v_int2 & v_mask;
                     v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                     v_expand(v_int0, v_int00, v_int01);
@@ -1627,9 +1603,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_src = v_load(src + x);
                     v_src = v_src & v_reinterpret_as_f32(v_mask);
@@ -1646,9 +1620,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_src0, v_src1, v_src2;
@@ -1709,11 +1681,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
                     v_float64x2 v_src0 = v_load(src + x);
@@ -1731,11 +1701,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2059,11 +2027,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+                    v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+                    v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
                     v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0));
                     v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1));
@@ -2073,11 +2040,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
-                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
-                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+                    v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+                    v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+                    v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                    v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
                     v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                     v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src21, v_2src21;
@@ -2109,12 +2075,8 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
             int size = len * cn;
             for ( ; x <= size - cVectorWidth; x += cVectorWidth)
             {
-                v_uint8x16 v_1src = v_load(src1 + x);
-                v_uint8x16 v_2src = v_load(src2 + x);
-                v_uint16x8 v_1int, v_2int, dummy;
-                v_expand(v_1src, v_1int, dummy);
-                v_expand(v_2src, v_2int, dummy);
+                v_uint16x8 v_1int = v_load_expand(src1 + x);
+                v_uint16x8 v_2int = v_load_expand(src2 + x);
                 v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                 v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2148,19 +2110,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
         }
         else
         {
-            v_uint8x16 v_0 = v_setzero_u8();
+            v_uint16x8 v_0 = v_setzero_u16();
             if (cn == 1)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
                     v_mask = ~(v_mask == v_0);
-                    v_uint8x16 v_1src = v_load(src1 + x) & v_mask;
-                    v_uint8x16 v_2src = v_load(src2 + x) & v_mask;
-                    v_uint16x8 v_1int, v_2int, dummy;
-                    v_expand(v_1src, v_1int, dummy);
-                    v_expand(v_2src, v_2int, dummy);
+                    v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask;
+                    v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask;
                     v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                     v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2194,19 +2152,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
             }
             else if (cn == 3)
             {
-                for ( ; x <= len - cVectorWidth; x += cVectorWidth)
+                for ( ; x <= len - /*cVectorWidth*/ 16; x += cVectorWidth)
                 {
-                    v_uint8x16 v_mask = v_load(mask + x);
-                    v_mask = ~(v_mask == v_0);
                     v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                     v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                     v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                    v_1src0 = v_1src0 & v_mask;
-                    v_1src1 = v_1src1 & v_mask;
-                    v_1src2 = v_1src2 & v_mask;
-                    v_2src0 = v_2src0 & v_mask;
-                    v_2src1 = v_2src1 & v_mask;
-                    v_2src2 = v_2src2 & v_mask;
                     v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy;
                     v_expand(v_1src0, v_1int0, dummy);
@@ -2216,6 +2166,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
                     v_expand(v_2src1, v_2int1, dummy);
                     v_expand(v_2src2, v_2int2, dummy);
+                    v_uint16x8 v_mask = v_load_expand(mask + x);
+                    v_mask = ~(v_mask == v_0);
+                    v_1int0 = v_1int0 & v_mask;
+                    v_1int1 = v_1int1 & v_mask;
+                    v_1int2 = v_1int2 & v_mask;
+                    v_2int0 = v_2int0 & v_mask;
+                    v_2int1 = v_2int1 & v_mask;
+                    v_2int2 = v_2int2 & v_mask;
                     v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
                     v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
                     v_expand(v_1int0, v_1int00, v_1int01);
@@ -2440,9 +2399,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_1src = v_load(src1 + x);
                     v_float32x4 v_2src = v_load(src2 + x);
@@ -2462,9 +2419,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 v_mask, dummy;
-                    v_expand(stub, v_mask, dummy);
+                    v_uint32x4 v_mask = v_load_expand_q(mask + x);
                     v_mask = ~(v_mask == v_0);
                     v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                     v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
@@ -2522,11 +2477,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2543,11 +2496,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
             {
                 for ( ; x <= len - cVectorWidth; x += cVectorWidth)
                 {
-                    v_uint16x8 stub = v_load_expand(mask + x);
-                    v_uint32x4 stub0, stub1;
-                    v_expand(stub, stub0, stub1);
+                    v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
                     v_uint64x2 v_masku640, v_masku641;
-                    v_expand(stub0, v_masku640, v_masku641);
+                    v_expand(v_mask32, v_masku640, v_masku641);
                     v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                     v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2704,12 +2655,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
             int size = len * cn;
             for ( ; x <= size - cVectorWidth; x += cVectorWidth)
             {
-                v_uint8x16 v_src = v_load(src + x);
-                v_uint16x8 v_int, dummy;
-                v_expand(v_src, v_int, dummy);
+                v_uint16x8 v_src16 = v_load_expand(src + x);
                 v_uint32x4 v_int_0, v_int_1;
-                v_expand(v_int, v_int_0, v_int_1);
+                v_expand(v_src16, v_int_0, v_int_1);
                 v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
                 v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
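
Note on the pattern applied throughout: each hunk replaces the "load a full
vector, v_expand it, and discard the unused half (the dummy variables)" idiom
with the single-step widening loads v_load_expand / v_load_expand_q, which
read only the bytes the widened result actually needs. A minimal sketch of
the equivalence, assuming OpenCV's universal-intrinsics header; the helper
name widen_mask_lo is hypothetical, for illustration only:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Both forms produce the same eight uint16 lanes from mask[x..x+7];
// the second reads 8 bytes instead of 16.
v_uint16x8 widen_mask_lo(const uchar* mask, int x)
{
    v_uint8x16 bytes = v_load(mask + x);        // old: 16-byte load
    v_uint16x8 lo, hi;                          // 'hi' is discarded
    v_expand(bytes, lo, hi);                    // widen both halves

    v_uint16x8 lo2 = v_load_expand(mask + x);   // new: 8-byte load + widen

    (void)lo; (void)hi;                         // lo equals lo2, lane for lane
    return lo2;
}

Besides removing the dead upper half, the narrower load matters for
correctness: with a loop bound of x <= len - 8, a 16-byte v_load of mask + x
can read past the end of the mask row on the final iteration, whereas
v_load_expand stays within bounds. The /*cVectorWidth*/ 16 bounds introduced
in the cn == 3 hunks appear to address the same over-read for the interleaved
16-byte source loads.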