|
|
@@ -606,17 +606,15 @@ void polarToCart( InputArray src1, InputArray src2,
                 {
                     k = 0;
 
-#if CV_SIMD128
-                    if( hasSIMD128() )
-                    {
-                        int cWidth = v_float32x4::nlanes;
-                        for( ; k <= len - cWidth; k += cWidth )
-                        {
-                            v_float32x4 v_m = v_load(mag + k);
-                            v_store(x + k, v_load(x + k) * v_m);
-                            v_store(y + k, v_load(y + k) * v_m);
-                        }
-                    }
+#if CV_SIMD
+                    int cWidth = v_float32::nlanes;
+                    for( ; k <= len - cWidth; k += cWidth )
+                    {
+                        v_float32 v_m = vx_load(mag + k);
+                        v_store(x + k, vx_load(x + k) * v_m);
+                        v_store(y + k, vx_load(y + k) * v_m);
+                    }
+                    vx_cleanup();
 #endif
 
                     for( ; k < len; k++ )
|
|
@@ -735,7 +733,7 @@ struct iPow_SIMD
     }
 };
 
-#if CV_SIMD128
+#if CV_SIMD
 
 template <>
 struct iPow_SIMD<uchar, int>
@@ -743,13 +741,13 @@ struct iPow_SIMD<uchar, int>
     int operator() ( const uchar * src, uchar * dst, int len, int power )
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load_expand(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load_expand(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
             int p = power;
 
@@ -771,6 +769,7 @@ struct iPow_SIMD<uchar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -782,13 +781,13 @@ struct iPow_SIMD<schar, int>
    int operator() ( const schar * src, schar * dst, int len, int power)
    {
        int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
        {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load_expand(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load_expand(src + i);
+            v_int32 v_b1, v_b2;
            v_expand(v, v_b1, v_b2);
            int p = power;
 
@@ -810,6 +809,7 @@ struct iPow_SIMD<schar, int>
            v = v_pack(v_a1, v_a2);
            v_pack_store(dst + i, v);
        }
+        vx_cleanup();
 
        return i;
    }
@@ -821,13 +821,13 @@ struct iPow_SIMD<ushort, int>
    int operator() ( const ushort * src, ushort * dst, int len, int power)
    {
        int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
        {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load(src + i);
+            v_uint32 v_b1, v_b2;
            v_expand(v, v_b1, v_b2);
            int p = power;
 
@@ -849,6 +849,7 @@ struct iPow_SIMD<ushort, int>
            v = v_pack(v_a1, v_a2);
            v_store(dst + i, v);
        }
+        vx_cleanup();
 
        return i;
    }
@@ -860,13 +861,13 @@ struct iPow_SIMD<short, int>
    int operator() ( const short * src, short * dst, int len, int power)
    {
        int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
        {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load(src + i);
+            v_int32 v_b1, v_b2;
            v_expand(v, v_b1, v_b2);
            int p = power;
 
@@ -888,6 +889,7 @@ struct iPow_SIMD<short, int>
            v = v_pack(v_a1, v_a2);
            v_store(dst + i, v);
        }
+        vx_cleanup();
 
        return i;
    }
@@ -899,12 +901,12 @@ struct iPow_SIMD<int, int>
    int operator() ( const int * src, int * dst, int len, int power)
    {
        int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
        {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
            int p = power;
 
            while( p > 1 )
@@ -923,8 +925,9 @@ struct iPow_SIMD<int, int>
            v_a2 *= v_b2;
 
            v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_int32::nlanes, v_a2);
        }
+        vx_cleanup();
 
        return i;
    }
@@ -936,12 +939,12 @@ struct iPow_SIMD<float, float>
    int operator() ( const float * src, float * dst, int len, int power)
    {
        int i = 0;
-        v_float32x4 v_1 = v_setall_f32(1.f);
+        v_float32 v_1 = vx_setall_f32(1.f);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
        {
-            v_float32x4 v_a1 = v_1, v_a2 = v_1;
-            v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_float32 v_a1 = v_1, v_a2 = v_1;
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
            int p = std::abs(power);
            if( power < 0 )
            {
@@ -965,26 +968,27 @@ struct iPow_SIMD<float, float>
            v_a2 *= v_b2;
 
            v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_float32::nlanes, v_a2);
        }
+        vx_cleanup();
 
        return i;
    }
 };
 
-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template <>
 struct iPow_SIMD<double, double>
 {
    int operator() ( const double * src, double * dst, int len, int power)
    {
        int i = 0;
-        v_float64x2 v_1 = v_setall_f64(1.);
+        v_float64 v_1 = vx_setall_f64(1.);
 
-        for ( ; i <= len - 4; i += 4)
+        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
        {
-            v_float64x2 v_a1 = v_1, v_a2 = v_1;
-            v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
+            v_float64 v_a1 = v_1, v_a2 = v_1;
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
            int p = std::abs(power);
            if( power < 0 )
            {
@@ -1008,8 +1012,9 @@ struct iPow_SIMD<double, double>
            v_a2 *= v_b2;
 
            v_store(dst + i, v_a1);
-            v_store(dst + i + 2, v_a2);
+            v_store(dst + i + v_float64::nlanes, v_a2);
        }
+        vx_cleanup();
 
        return i;
    }
@@ -1594,9 +1599,9 @@ void patchNaNs( InputOutputArray _a, double _val )
     Cv32suf val;
     val.f = (float)_val;
 
-#if CV_SIMD128
-    v_int32x4 v_mask1 = v_setall_s32(0x7fffffff), v_mask2 = v_setall_s32(0x7f800000);
-    v_int32x4 v_val = v_setall_s32(val.i);
+#if CV_SIMD
+    v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
+    v_int32 v_val = vx_setall_s32(val.i);
 #endif
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1604,18 +1609,16 @@ void patchNaNs( InputOutputArray _a, double _val )
         int* tptr = ptrs[0];
         size_t j = 0;
 
-#if CV_SIMD128
-        if( hasSIMD128() )
-        {
-            size_t cWidth = (size_t)v_int32x4::nlanes;
-            for ( ; j + cWidth <= len; j += cWidth)
-            {
-                v_int32x4 v_src = v_load(tptr + j);
-                v_int32x4 v_cmp_mask = v_mask2 < (v_src & v_mask1);
-                v_int32x4 v_dst = v_select(v_cmp_mask, v_val, v_src);
-                v_store(tptr + j, v_dst);
-            }
-        }
+#if CV_SIMD
+        size_t cWidth = (size_t)v_int32::nlanes;
+        for ( ; j + cWidth <= len; j += cWidth)
+        {
+            v_int32 v_src = vx_load(tptr + j);
+            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
+            v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
+            v_store(tptr + j, v_dst);
+        }
+        vx_cleanup();
 #endif
 
        for( ; j < len; j++ )
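
All of the hunks above apply the same migration: the fixed 128-bit types (v_float32x4, v_int32x4, v_uint16x8, ...) and their v_load / v_setall_* helpers become the width-agnostic wide types (v_float32, v_int32, v_uint16, ...) with vx_load / vx_setall_*, hard-coded strides of 4 or 8 become ::nlanes expressions, the hasSIMD128() guard disappears, and vx_cleanup() is called once after each vector loop. As a rough sketch of that pattern outside the patch, assuming an OpenCV build that already ships the wide universal intrinsics in opencv2/core/hal/intrin.hpp (the helper name scaleByMag and its signature are invented for this illustration):

    #include <opencv2/core/hal/intrin.hpp>  // provides CV_SIMD, v_float32, vx_load, v_store, vx_cleanup

    // Hypothetical helper (not part of the patch): multiply x[0..len) in place by mag[0..len),
    // mirroring the polarToCart hunk above.
    static void scaleByMag(const float* mag, float* x, int len)
    {
        int k = 0;
    #if CV_SIMD
        const int cWidth = cv::v_float32::nlanes;         // 4 on SSE/NEON, 8 on AVX2, 16 on AVX-512
        for( ; k <= len - cWidth; k += cWidth )
        {
            cv::v_float32 v_m = cv::vx_load(mag + k);     // widest enabled register, not fixed 128-bit
            cv::v_store(x + k, cv::vx_load(x + k) * v_m);
        }
        cv::vx_cleanup();                                 // e.g. clears upper AVX state on x86 builds
    #endif
        for( ; k < len; k++ )                             // scalar tail for the remaining elements
            x[k] *= mag[k];
    }

The iPow_SIMD specializations and patchNaNs follow the same skeleton; only the element type, the load/expand/pack helpers, and the per-iteration body differ.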
|
|
|