|
|
@@ -587,6 +587,7 @@ struct RowVec_8u32s
|
|
|
i += v_uint32::nlanes; |
|
|
|
i += v_uint32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1083,6 +1084,7 @@ struct SymmRowSmallVec_8u32s
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1106,6 +1108,8 @@ struct SymmColumnVec_32s8u
|
|
|
int operator()(const uchar** _src, uchar* dst, int width) const |
|
|
|
int operator()(const uchar** _src, uchar* dst, int width) const |
|
|
|
{ |
|
|
|
{ |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
|
|
|
|
if( _ksize == 1 ) |
|
|
|
|
|
|
|
return 0; |
|
|
|
int ksize2 = _ksize/2; |
|
|
|
int ksize2 = _ksize/2; |
|
|
|
const float* ky = kernel.ptr<float>() + ksize2; |
|
|
|
const float* ky = kernel.ptr<float>() + ksize2; |
|
|
|
int i = 0, k; |
|
|
|
int i = 0, k; |
|
|
@@ -1115,9 +1119,8 @@ struct SymmColumnVec_32s8u
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
if( symmetrical ) |
|
|
|
if( symmetrical ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (_ksize == 1) |
|
|
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
v_float32 f0 = vx_setall_f32(ky[0]); |
|
|
|
v_float32 f0 = vx_setall_f32(ky[0]); |
|
|
|
|
|
|
|
v_float32 f1 = vx_setall_f32(ky[1]); |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int* S = src[0] + i; |
|
|
|
const int* S = src[0] + i; |
|
|
@@ -1125,11 +1128,17 @@ struct SymmColumnVec_32s8u
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
const int* S0 = src[1] + i; |
|
|
|
|
|
|
|
const int* S1 = src[-1] + i; |
|
|
|
|
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
const int* S0 = src[k] + i; |
|
|
|
S0 = src[k] + i; |
|
|
|
const int* S1 = src[-k] + i; |
|
|
|
S1 = src[-k] + i; |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); |
|
|
|
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); |
|
|
@@ -1142,11 +1151,15 @@ struct SymmColumnVec_32s8u
|
|
|
const int* S = src[0] + i; |
|
|
|
const int* S = src[0] + i; |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); |
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); |
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
const int* S0 = src[1] + i; |
|
|
|
|
|
|
|
const int* S1 = src[-1] + i; |
|
|
|
|
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
const int* S0 = src[k] + i; |
|
|
|
S0 = src[k] + i; |
|
|
|
const int* S1 = src[-k] + i; |
|
|
|
S1 = src[-k] + i; |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
} |
|
|
|
} |
|
|
@@ -1160,7 +1173,8 @@ struct SymmColumnVec_32s8u
|
|
|
#endif |
|
|
|
#endif |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); |
|
|
|
v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
@@ -1170,17 +1184,20 @@ struct SymmColumnVec_32s8u
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 f1 = vx_setall_f32(ky[1]); |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
const int* S0 = src[1] + i; |
|
|
|
v_float32 s1 = d4; |
|
|
|
const int* S1 = src[-1] + i; |
|
|
|
v_float32 s2 = d4; |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); |
|
|
|
v_float32 s3 = d4; |
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); |
|
|
|
for ( k = 1; k <= ksize2; k++ ) |
|
|
|
v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4); |
|
|
|
|
|
|
|
for ( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
const int* S0 = src[k] + i; |
|
|
|
S0 = src[k] + i; |
|
|
|
const int* S1 = src[-k] + i; |
|
|
|
S1 = src[-k] + i; |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); |
|
|
|
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); |
|
|
@@ -1190,13 +1207,15 @@ struct SymmColumnVec_32s8u
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
const int* S0 = src[1] + i; |
|
|
|
v_float32 s1 = d4; |
|
|
|
const int* S1 = src[-1] + i; |
|
|
|
for ( k = 1; k <= ksize2; k++ ) |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); |
|
|
|
|
|
|
|
for ( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
v_float32 f = vx_setall_f32(ky[k]); |
|
|
|
const int* S0 = src[k] + i; |
|
|
|
S0 = src[k] + i; |
|
|
|
const int* S1 = src[-k] + i; |
|
|
|
S1 = src[-k] + i; |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); |
|
|
|
} |
|
|
|
} |
|
|
@@ -1209,8 +1228,8 @@ struct SymmColumnVec_32s8u
|
|
|
if( i <= width - v_int32x4::nlanes ) |
|
|
|
if( i <= width - v_int32x4::nlanes ) |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32x4 s0 = v_setall_f32(delta); |
|
|
|
v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); |
|
|
|
for (k = 1; k <= ksize2; k++) |
|
|
|
for (k = 2; k <= ksize2; k++) |
|
|
|
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
@@ -1219,6 +1238,7 @@ struct SymmColumnVec_32s8u
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1250,57 +1270,104 @@ struct SymmColumnSmallVec_32s16s
|
|
|
short* dst = (short*)_dst; |
|
|
|
short* dst = (short*)_dst; |
|
|
|
|
|
|
|
|
|
|
|
v_float32 df4 = vx_setall_f32(delta); |
|
|
|
v_float32 df4 = vx_setall_f32(delta); |
|
|
|
v_int32 d4 = v_round(df4); |
|
|
|
int d = cvRound(delta); |
|
|
|
|
|
|
|
v_int16 d8 = vx_setall_s16((short)d); |
|
|
|
if( symmetrical ) |
|
|
|
if( symmetrical ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if( ky[0] == 2 && ky[1] == 1 ) |
|
|
|
if( ky[0] == 2 && ky[1] == 1 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_int32 s0 = vx_load(S1 + i); |
|
|
|
|
|
|
|
v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
|
|
|
|
v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); |
|
|
|
|
|
|
|
v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), |
|
|
|
|
|
|
|
vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_int32 sl = vx_load(S1 + i); |
|
|
|
v_int32 sl = vx_load(S1 + i); |
|
|
|
v_int32 sh = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
v_int32 sh = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh))); |
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_int32 s = vx_load(S1 + i); |
|
|
|
v_int32 s = vx_load(S1 + i); |
|
|
|
v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s)); |
|
|
|
v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); |
|
|
|
i += v_int32::nlanes; |
|
|
|
i += v_int32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else if( ky[0] == -2 && ky[1] == 1 ) |
|
|
|
else if( ky[0] == -2 && ky[1] == 1 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_int32 s0 = vx_load(S1 + i); |
|
|
|
|
|
|
|
v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
|
|
|
|
v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); |
|
|
|
|
|
|
|
v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), |
|
|
|
|
|
|
|
vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), |
|
|
|
|
|
|
|
vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_int32 sl = vx_load(S1 + i); |
|
|
|
v_int32 sl = vx_load(S1 + i); |
|
|
|
v_int32 sh = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
v_int32 sh = vx_load(S1 + i + v_int32::nlanes); |
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh))); |
|
|
|
v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_int32 s = vx_load(S1 + i); |
|
|
|
v_int32 s = vx_load(S1 + i); |
|
|
|
v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s)); |
|
|
|
v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); |
|
|
|
i += v_int32::nlanes; |
|
|
|
i += v_int32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
#if CV_NEON |
|
|
|
else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) |
|
|
|
else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); |
|
|
|
v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
v_int32 d4 = vx_setall_s32(d); |
|
|
|
|
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), |
|
|
|
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), |
|
|
|
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); |
|
|
|
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), |
|
|
|
|
|
|
|
v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), |
|
|
|
|
|
|
|
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); |
|
|
|
v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); |
|
|
|
i += v_int32::nlanes; |
|
|
|
i += v_int32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); |
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), |
|
|
|
|
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), |
|
|
|
|
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), |
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), |
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); |
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); |
|
|
|
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); |
|
|
@@ -1314,20 +1381,38 @@ struct SymmColumnSmallVec_32s16s
|
|
|
{ |
|
|
|
{ |
|
|
|
if( ky[1] < 0 ) |
|
|
|
if( ky[1] < 0 ) |
|
|
|
std::swap(S0, S2); |
|
|
|
std::swap(S0, S2); |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4)); |
|
|
|
{ |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); |
|
|
|
v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); |
|
|
|
i += v_int32::nlanes; |
|
|
|
i += v_int32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 k1 = vx_setall_f32(ky[1]); |
|
|
|
v_float32 k1 = vx_setall_f32(ky[1]); |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), |
|
|
|
|
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), |
|
|
|
|
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), |
|
|
|
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), |
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); |
|
|
|
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); |
|
|
|
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); |
|
|
@@ -1336,6 +1421,7 @@ struct SymmColumnSmallVec_32s16s
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1362,19 +1448,43 @@ struct RowVec_16s32f
|
|
|
const float* _kx = kernel.ptr<float>(); |
|
|
|
const float* _kx = kernel.ptr<float>(); |
|
|
|
width *= cn; |
|
|
|
width *= cn; |
|
|
|
|
|
|
|
|
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const short* src = (const short*)_src + i; |
|
|
|
|
|
|
|
v_float32 s0 = vx_setzero_f32(); |
|
|
|
|
|
|
|
v_float32 s1 = vx_setzero_f32(); |
|
|
|
|
|
|
|
v_float32 s2 = vx_setzero_f32(); |
|
|
|
|
|
|
|
v_float32 s3 = vx_setzero_f32(); |
|
|
|
|
|
|
|
for( k = 0; k < _ksize; k++, src += cn ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 f = vx_setall_f32(_kx[k]); |
|
|
|
|
|
|
|
v_int16 xl = vx_load(src); |
|
|
|
|
|
|
|
v_int16 xh = vx_load(src + v_int16::nlanes); |
|
|
|
|
|
|
|
s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
v_store(dst + i + 2*v_float32::nlanes, s2); |
|
|
|
|
|
|
|
v_store(dst + i + 3*v_float32::nlanes, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const short* src = (const short*)_src + i; |
|
|
|
const short* src = (const short*)_src + i; |
|
|
|
v_float32 s0 = vx_setzero_f32(); |
|
|
|
v_float32 s0 = vx_setzero_f32(); |
|
|
|
v_float32 s1 = vx_setzero_f32(); |
|
|
|
v_float32 s1 = vx_setzero_f32(); |
|
|
|
for( k = 0; k < _ksize; k++, src += cn ) |
|
|
|
for( k = 0; k < _ksize; k++, src += cn ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 f = vx_setall_f32(_kx[k]); |
|
|
|
v_int16 x = vx_load(src); |
|
|
|
v_int16 x = vx_load(src); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); |
|
|
|
} |
|
|
|
} |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
@@ -1385,6 +1495,7 @@ struct RowVec_16s32f
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
i += v_float32::nlanes; |
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1406,6 +1517,8 @@ struct SymmColumnVec_32f16s
|
|
|
int operator()(const uchar** _src, uchar* _dst, int width) const |
|
|
|
int operator()(const uchar** _src, uchar* _dst, int width) const |
|
|
|
{ |
|
|
|
{ |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
|
|
|
|
if( _ksize == 1 ) |
|
|
|
|
|
|
|
return 0; |
|
|
|
int ksize2 = _ksize / 2; |
|
|
|
int ksize2 = _ksize / 2; |
|
|
|
const float* ky = kernel.ptr<float>() + ksize2; |
|
|
|
const float* ky = kernel.ptr<float>() + ksize2; |
|
|
|
int i = 0, k; |
|
|
|
int i = 0, k; |
|
|
@@ -1416,25 +1529,49 @@ struct SymmColumnVec_32f16s
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
if( symmetrical ) |
|
|
|
if( symmetrical ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (_ksize == 1) |
|
|
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]); |
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]); |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
v_float32 k1 = vx_setall_f32(ky[1]); |
|
|
|
|
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); |
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 k1 = vx_setall_f32(ky[k]); |
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); |
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); |
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
} |
|
|
|
} |
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
i += v_float32::nlanes; |
|
|
|
i += v_float32::nlanes; |
|
|
@@ -1442,28 +1579,48 @@ struct SymmColumnVec_32f16s
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
|
for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) |
|
|
|
v_float32 k1 = vx_setall_f32(ky[1]); |
|
|
|
|
|
|
|
for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
v_float32 s1 = d4; |
|
|
|
v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 k1 = vx_setall_f32(ky[k]); |
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); |
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1); |
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
|
|
|
|
v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_int16::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
} |
|
|
|
} |
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
v_store(dst + i, v_pack(v_round(s0), v_round(s1))); |
|
|
|
|
|
|
|
i += v_int16::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
i += v_float32::nlanes; |
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1505,6 +1662,7 @@ struct RowVec_32f
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
int _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
|
|
|
|
CV_DbgAssert(_ksize > 0); |
|
|
|
const float* src0 = (const float*)_src; |
|
|
|
const float* src0 = (const float*)_src; |
|
|
|
float* dst = (float*)_dst; |
|
|
|
float* dst = (float*)_dst; |
|
|
|
const float* _kx = kernel.ptr<float>(); |
|
|
|
const float* _kx = kernel.ptr<float>(); |
|
|
@@ -1516,14 +1674,55 @@ struct RowVec_32f
|
|
|
if (haveAVX2) |
|
|
|
if (haveAVX2) |
|
|
|
return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); |
|
|
|
return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
v_float32 k0 = vx_setall_f32(_kx[0]); |
|
|
|
|
|
|
|
for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const float* src = src0 + i; |
|
|
|
const float* src = src0 + i; |
|
|
|
v_float32 s0 = vx_setzero_f32(); |
|
|
|
v_float32 s0 = vx_load(src) * k0; |
|
|
|
for( k = 0; k < _ksize; k++, src += cn ) |
|
|
|
v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; |
|
|
|
|
|
|
|
v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; |
|
|
|
|
|
|
|
v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; |
|
|
|
|
|
|
|
src += cn; |
|
|
|
|
|
|
|
for( k = 1; k < _ksize; k++, src += cn ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k1 = vx_setall_f32(_kx[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
v_store(dst + i + 2*v_float32::nlanes, s2); |
|
|
|
|
|
|
|
v_store(dst + i + 3*v_float32::nlanes, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - 2*v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const float* src = src0 + i; |
|
|
|
|
|
|
|
v_float32 s0 = vx_load(src) * k0; |
|
|
|
|
|
|
|
v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; |
|
|
|
|
|
|
|
src += cn; |
|
|
|
|
|
|
|
for( k = 1; k < _ksize; k++, src += cn ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k1 = vx_setall_f32(_kx[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
i += 2*v_float32::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const float* src = src0 + i; |
|
|
|
|
|
|
|
v_float32 s0 = vx_load(src) * k0; |
|
|
|
|
|
|
|
src += cn; |
|
|
|
|
|
|
|
for( k = 1; k < _ksize; k++, src += cn ) |
|
|
|
s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1584,6 +1783,8 @@ struct SymmRowSmallVec_32f
|
|
|
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const |
|
|
|
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const |
|
|
|
{ |
|
|
|
{ |
|
|
|
int i = 0, _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
int i = 0, _ksize = kernel.rows + kernel.cols - 1; |
|
|
|
|
|
|
|
if( _ksize == 1 ) |
|
|
|
|
|
|
|
return 0; |
|
|
|
float* dst = (float*)_dst; |
|
|
|
float* dst = (float*)_dst; |
|
|
|
const float* src = (const float*)_src + (_ksize/2)*cn; |
|
|
|
const float* src = (const float*)_src + (_ksize/2)*cn; |
|
|
|
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; |
|
|
|
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; |
|
|
@@ -1592,15 +1793,28 @@ struct SymmRowSmallVec_32f
|
|
|
|
|
|
|
|
|
|
|
if( symmetrical ) |
|
|
|
if( symmetrical ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if( _ksize == 1 ) |
|
|
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
if( _ksize == 3 ) |
|
|
|
if( _ksize == 3 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if( fabs(kx[0]) == 2 && kx[1] == 1 ) |
|
|
|
if( fabs(kx[0]) == 2 && kx[1] == 1 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
#if CV_FMA3 || CV_AVX2 |
|
|
|
v_float32 k0 = vx_setall_f32(kx[0]); |
|
|
|
v_float32 k0 = vx_setall_f32(kx[0]); |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); |
|
|
|
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
if( kx[0] > 0 ) |
|
|
|
|
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 x = vx_load(src); |
|
|
|
|
|
|
|
v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 x = vx_load(src); |
|
|
|
|
|
|
|
v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
@@ -1613,9 +1827,17 @@ struct SymmRowSmallVec_32f
|
|
|
{ |
|
|
|
{ |
|
|
|
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) |
|
|
|
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
#if CV_FMA3 || CV_AVX2 |
|
|
|
v_float32 k0 = vx_setall_f32(-2); |
|
|
|
v_float32 k0 = vx_setall_f32(-2); |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); |
|
|
|
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 x = vx_load(src); |
|
|
|
|
|
|
|
v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
@@ -1647,6 +1869,7 @@ struct SymmRowSmallVec_32f
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1688,12 +1911,47 @@ struct SymmColumnVec_32f
|
|
|
return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); |
|
|
|
return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
const v_float32 d4 = vx_setall_f32(delta); |
|
|
|
const v_float32 d4 = vx_setall_f32(delta); |
|
|
|
for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
const v_float32 k0 = vx_setall_f32(ky[0]); |
|
|
|
|
|
|
|
for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k1 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
v_store(dst + i + 2*v_float32::nlanes, s2); |
|
|
|
|
|
|
|
v_store(dst + i + 3*v_float32::nlanes, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - 2*v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); |
|
|
|
|
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k1 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
i += 2*v_float32::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4); |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
@@ -1702,16 +1960,53 @@ struct SymmColumnVec_32f
|
|
|
if (haveAVX2) |
|
|
|
if (haveAVX2) |
|
|
|
return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); |
|
|
|
return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
CV_DbgAssert(ksize2 > 0); |
|
|
|
const v_float32 d4 = vx_setall_f32(delta); |
|
|
|
const v_float32 d4 = vx_setall_f32(delta); |
|
|
|
for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
const v_float32 k1 = vx_setall_f32(ky[1]); |
|
|
|
|
|
|
|
for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
for( k = 1; k <= ksize2; k++ ) |
|
|
|
v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
v_store(dst + i + 2*v_float32::nlanes, s2); |
|
|
|
|
|
|
|
v_store(dst + i + 3*v_float32::nlanes, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - 2*v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 k2 = vx_setall_f32(ky[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
i += 2*v_float32::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); |
|
|
|
|
|
|
|
for( k = 2; k <= ksize2; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1748,9 +2043,24 @@ struct SymmColumnSmallVec_32f
|
|
|
{ |
|
|
|
{ |
|
|
|
if( fabs(ky[0]) == 2 && ky[1] == 1 ) |
|
|
|
if( fabs(ky[0]) == 2 && ky[1] == 1 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
#if CV_FMA3 || CV_AVX2 |
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]); |
|
|
|
v_float32 k0 = vx_setall_f32(ky[0]); |
|
|
|
for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); |
|
|
|
v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
if(ky[0] > 0) |
|
|
|
|
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 x = vx_load(S1 + i); |
|
|
|
|
|
|
|
v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 x = vx_load(S1 + i); |
|
|
|
|
|
|
|
v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
{ |
|
|
|
{ |
|
|
@@ -1776,6 +2086,7 @@ struct SymmColumnSmallVec_32f
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1804,19 +2115,27 @@ struct FilterVec_8u
|
|
|
|
|
|
|
|
|
|
|
int operator()(const uchar** src, uchar* dst, int width) const |
|
|
|
int operator()(const uchar** src, uchar* dst, int width) const |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
CV_DbgAssert(_nz > 0); |
|
|
|
const float* kf = (const float*)&coeffs[0]; |
|
|
|
const float* kf = (const float*)&coeffs[0]; |
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
|
|
|
|
|
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
|
|
|
|
v_float32 f0 = vx_setall_f32(kf[0]); |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; |
|
|
|
v_uint16 xl, xh; |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
v_expand(vx_load(src[0] + i), xl, xh); |
|
|
|
|
|
|
|
v_uint32 x0, x1, x2, x3; |
|
|
|
|
|
|
|
v_expand(xl, x0, x1); |
|
|
|
|
|
|
|
v_expand(xh, x2, x3); |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); |
|
|
|
|
|
|
|
v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_uint16 xl, xh; |
|
|
|
|
|
|
|
v_expand(vx_load(src[k] + i), xl, xh); |
|
|
|
v_expand(vx_load(src[k] + i), xl, xh); |
|
|
|
v_uint32 x0, x1, x2, x3; |
|
|
|
|
|
|
|
v_expand(xl, x0, x1); |
|
|
|
v_expand(xl, x0, x1); |
|
|
|
v_expand(xh, x2, x3); |
|
|
|
v_expand(xh, x2, x3); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); |
|
|
@@ -1828,11 +2147,13 @@ struct FilterVec_8u
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4, s1 = d4; |
|
|
|
v_uint32 x0, x1; |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
v_expand(vx_load_expand(src[0] + i), x0, x1); |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_uint32 x0, x1; |
|
|
|
|
|
|
|
v_expand(vx_load_expand(src[k] + i), x0, x1); |
|
|
|
v_expand(vx_load_expand(src[k] + i), x0, x1); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); |
|
|
@@ -1846,8 +2167,8 @@ struct FilterVec_8u
|
|
|
if( i <= width - v_int32x4::nlanes ) |
|
|
|
if( i <= width - v_int32x4::nlanes ) |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32x4 s0 = v_setall_f32(delta); |
|
|
|
v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int32x4 s32 = v_round(s0); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
|
v_int16x8 s16 = v_pack(s32, s32); |
|
|
@@ -1855,6 +2176,7 @@ struct FilterVec_8u
|
|
|
i += v_int32x4::nlanes; |
|
|
|
i += v_int32x4::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1879,18 +2201,24 @@ struct FilterVec_8u16s
|
|
|
|
|
|
|
|
|
|
|
int operator()(const uchar** src, uchar* _dst, int width) const |
|
|
|
int operator()(const uchar** src, uchar* _dst, int width) const |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
CV_DbgAssert(_nz > 0); |
|
|
|
const float* kf = (const float*)&coeffs[0]; |
|
|
|
const float* kf = (const float*)&coeffs[0]; |
|
|
|
short* dst = (short*)_dst; |
|
|
|
short* dst = (short*)_dst; |
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
|
|
|
|
|
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
|
|
|
|
v_float32 f0 = vx_setall_f32(kf[0]); |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; |
|
|
|
v_uint16 xl, xh; |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
v_expand(vx_load(src[0] + i), xl, xh); |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4); |
|
|
|
|
|
|
|
v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_uint16 xl, xh; |
|
|
|
|
|
|
|
v_expand(vx_load(src[k] + i), xl, xh); |
|
|
|
v_expand(vx_load(src[k] + i), xl, xh); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); |
|
|
@@ -1902,11 +2230,13 @@ struct FilterVec_8u16s
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
if( i <= width - v_uint16::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4, s1 = d4; |
|
|
|
v_uint16 x = vx_load_expand(src[0] + i); |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_float32 f = vx_setall_f32(kf[k]); |
|
|
|
v_uint16 x = vx_load_expand(src[k] + i); |
|
|
|
x = vx_load_expand(src[k] + i); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); |
|
|
|
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); |
|
|
|
} |
|
|
|
} |
|
|
@@ -1915,13 +2245,14 @@ struct FilterVec_8u16s
|
|
|
} |
|
|
|
} |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
if( i <= width - v_int32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); |
|
|
|
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
v_pack_store(dst + i, v_round(s0)); |
|
|
|
i += v_int32::nlanes; |
|
|
|
i += v_int32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@@ -1950,14 +2281,50 @@ struct FilterVec_32f
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
int i = 0, k, nz = _nz; |
|
|
|
|
|
|
|
|
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
v_float32 d4 = vx_setall_f32(delta); |
|
|
|
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) |
|
|
|
v_float32 f0 = vx_setall_f32(kf[0]); |
|
|
|
|
|
|
|
for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
v_float32 s0 = d4; |
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); |
|
|
|
for( k = 0; k < nz; k++ ) |
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); |
|
|
|
|
|
|
|
v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); |
|
|
|
|
|
|
|
v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 f1 = vx_setall_f32(kf[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i), f1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); |
|
|
|
|
|
|
|
s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); |
|
|
|
|
|
|
|
s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
v_store(dst + i + 2*v_float32::nlanes, s2); |
|
|
|
|
|
|
|
v_store(dst + i + 3*v_float32::nlanes, s3); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - 2*v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); |
|
|
|
|
|
|
|
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 f1 = vx_setall_f32(kf[k]); |
|
|
|
|
|
|
|
s0 = v_muladd(vx_load(src[k] + i), f1, s0); |
|
|
|
|
|
|
|
s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
v_store(dst + i + v_float32::nlanes, s1); |
|
|
|
|
|
|
|
i += 2*v_float32::nlanes; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if( i <= width - v_float32::nlanes ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); |
|
|
|
|
|
|
|
for( k = 1; k < nz; k++ ) |
|
|
|
s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); |
|
|
|
s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
v_store(dst + i, s0); |
|
|
|
|
|
|
|
i += v_float32::nlanes; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vx_cleanup(); |
|
|
|
return i; |
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|