@@ -208,6 +208,29 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+#define DIVRC_SIMD(SRC, DST)                                            \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],         \
+               const int length, const int chan, const float scale);
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width);
|
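Note: divrc_simd is the "reverse composition" of div_simd -- each output element is
(scalar / in) * scale rather than (in1 / in2) * scale, and a zero divisor maps to 0.
A minimal scalar reference model of the intended semantics (hypothetical helper for
illustration only, not the SIMD path; names follow the declaration above):

    // Hypothetical reference model for divrc_simd.
    template<typename SRC, typename DST>
    void divrc_ref(const float scalar[], const SRC in[], DST out[],
                   int length, int chan, float scale)
    {
        for (int x = 0; x < length; ++x)
        {
            float div = static_cast<float>(in[x]);               // divisor comes from the matrix
            float r   = (div == 0.f) ? 0.f
                                     : scalar[x % chan] * scale / div;
            out[x] = cv::saturate_cast<DST>(r);                  // saturate to destination depth
        }
    }
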
@@ -236,6 +259,28 @@ template<> struct vector_type_of<ushort> { using type = v_uint16; };
 template<> struct vector_type_of<short> { using type = v_int16; };
 template<> struct vector_type_of<float> { using type = v_float32; };
 
+template<typename scalar_t>
+struct zero_vec_type_of;
+
+template<typename scalar_t>
+using zero_vec_type_of_t = typename zero_vec_type_of<scalar_t>::type;
+
+template<> struct zero_vec_type_of<uchar>  { using type = v_int16;   };
+template<> struct zero_vec_type_of<ushort> { using type = v_int16;   };
+template<> struct zero_vec_type_of<short>  { using type = v_int16;   };
+template<> struct zero_vec_type_of<float>  { using type = v_float32; };
+
+template<typename scalar_t>
+struct univ_zero_vec_type_of;
+
+template<typename scalar_t>
+using univ_zero_vec_type_of_t = typename univ_zero_vec_type_of<scalar_t>::type;
+
+template<> struct univ_zero_vec_type_of<uchar>  { using type = v_uint8;   };
+template<> struct univ_zero_vec_type_of<ushort> { using type = v_int16;   };
+template<> struct univ_zero_vec_type_of<short>  { using type = v_int16;   };
+template<> struct univ_zero_vec_type_of<float>  { using type = v_float32; };
+
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
 {
     return vx_load(in);
|
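For orientation: vector_type_of_t maps a scalar depth to its universal-intrinsics
register type, while the new zero_vec_type_of_t / univ_zero_vec_type_of_t traits pick
the register type used for the zero-divisor mask of a given source depth. A minimal
usage sketch (make_zero_mask is a hypothetical helper, shown only to illustrate how
the traits compose with the vx_setall<...> expression used later in this patch):

    // For uchar sources the mask is built on v_int16 (after widening),
    // so zero_vec_type_of<uchar>::type == v_int16.
    template<typename SRC>
    zero_vec_type_of_t<SRC> make_zero_mask()
    {
        return vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
    }
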
@@ -295,143 +340,85 @@ CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int
 CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
                                      const v_int32& res1, const v_int32& res2)
 {
-    v_uint16 sel = v_reinterpret_as_u16(v_select(div == v_zero, v_zero, v_pack(res1, res2)));
-    vx_store(dst, sel);
+    vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero),
+                           v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2)));
 }
 
-//=================================================================================================
+//=============================================================================
 
-template<typename scale_tag_t, typename SRC, typename DST>
-CV_ALWAYS_INLINE
-typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
-                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
-                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
-{
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
-    if (length < nlanes)
-        return 0;
-
-    v_int16 v_zero = vx_setall_s16(0);
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
-
-            v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x]));
-
-            v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
-            v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
-
-            v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
-            v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
-
-            v_store_select(&out[x], div, v_zero, r1, r2);
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
-}
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE
+void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+                   const v_float32& a3, const v_float32& a4, const uchar* in2x,
+                   uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2]));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+
+    v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+            sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+            sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+            sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
+
+    v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
+    v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
+
+    vx_store(outx, v_pack_u(res1, res2));
+}
 
 //-------------------------------------------------------------------------------------------------
 
 template<typename scale_tag_t, typename SRC>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, short>::value ||
-                        std::is_same<SRC, ushort>::value, int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
+                        std::is_same<SRC, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const v_float32& a3, const v_float32& a4, const SRC* in2x,
+              uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
-    if (length < nlanes)
-        return 0;
-
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_int16 v_zero = vx_setall_s16(0);
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
-            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
-            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
-
-            v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x]));
-            v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes/2]));
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2]));
 
     v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
     v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
     v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
     v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
 
-            v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale)),
-                    sum2 = v_round(div_op(t, a2, fdiv2, scale)),
-                    sum3 = v_round(div_op(t, a3, fdiv3, scale)),
-                    sum4 = v_round(div_op(t, a4, fdiv4, scale));
+    v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+            sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+            sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+            sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
 
     v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
     v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
 
-            vx_store(&out[x], v_pack_u(res1, res2));
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    vx_store(outx, v_pack_u(res1, res2));
 }
 
 //-------------------------------------------------------------------------------------------------
 
 template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
-                             const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
+                                    const v_float32& a2, const v_float32& a3,
+                                    const v_float32& a4, const float* in2x, uchar* outx,
+                                    const v_float32& v_scale, const v_float32& v_zero)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
-    if (length < nlanes)
-        return 0;
-
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_float32 v_zero = vx_setall_f32(0);
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
-            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
-            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
-
-            v_float32 div1 = vg_load_f32(&in2[x]);
-            v_float32 div2 = vg_load_f32(&in2[x + nlanes / 4]);
-            v_float32 div3 = vg_load_f32(&in2[x + nlanes / 2]);
-            v_float32 div4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
+    v_float32 div1 = vg_load_f32(in2x);
+    v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]);
+    v_float32 div3 = vg_load_f32(&in2x[nlanes / 2]);
+    v_float32 div4 = vg_load_f32(&in2x[3 * nlanes / 4]);
 
-            v_float32 r1 = div_op(t, a1, div1, scale);
-            v_float32 r2 = div_op(t, a2, div2, scale);
-            v_float32 r3 = div_op(t, a3, div3, scale);
-            v_float32 r4 = div_op(t, a4, div4, scale);
+    v_float32 r1 = div_op(s_tag, a1, div1, v_scale);
+    v_float32 r2 = div_op(s_tag, a2, div2, v_scale);
+    v_float32 r3 = div_op(s_tag, a3, div3, v_scale);
+    v_float32 r4 = div_op(s_tag, a4, div4, v_scale);
 
     v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);
     v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
@@ -443,17 +430,21 @@ CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[]
     v_int32 res3 = v_round(sel3);
     v_int32 res4 = v_round(sel4);
 
-            vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    vx_store(outx, v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
 }
 
+template<typename scale_tag_t, typename SRC, typename Vtype>
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx,
+                              const v_float32& v_scale, const Vtype& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_float32 a1 = vg_load_f32(in1x);
+    v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]);
+    v_float32 a3 = vg_load_f32(&in1x[nlanes / 2]);
+    v_float32 a4 = vg_load_f32(&in1x[3 * nlanes / 4]);
+
+    div_simd_impl(s_tag, a1, a2, a3, a4, in2x, outx, v_scale, v_zero);
+}
+
 //-------------------------------------------------------------------------------------------------
|
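For orientation: the refactor above splits each per-type div_hal() loop into a tiny
per-register div_simd_impl() kernel plus a div_hal() wrapper that only loads the
inputs; a shared driver (div_simd_common, later in this patch) then owns the loop and
the unaligned tail. A minimal generic sketch of that tail idiom, which recurs
verbatim across this file (process() is a hypothetical stand-in for the per-vector
body; the callers guarantee length >= nlanes):

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
            process(x);          // one full vector per step
        if (x < length)
        {
            x = length - nlanes; // step back so the last vector overlaps
            continue;            // re-run the loop once for the tail
        }
        break;
    }
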
@@ -461,113 +452,117 @@ CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[]
 
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
-                        std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
+                        std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const uchar* in2x, DST* outx, const v_float32& v_scale,
+              const v_int16& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
-    if (length < nlanes)
-        return 0;
-
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_int16 v_zero = vx_setall_s16(0);
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
-
-            v_int16 div = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
-
-            v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
-            v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
-
-            v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
-            v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
-
-            v_store_select(&out[x], div, v_zero, r1, r2);
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    v_int16 div = v_reinterpret_as_s16(vx_load_expand(in2x));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+
+    v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+    v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
+
+    v_store_select(outx, div, v_zero, r1, r2);
 }
 
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const SRC* in2x, DST* outx, const v_float32& v_scale, const v_int16& v_zero)
+{
+    v_int16 div = v_reinterpret_as_s16(vx_load(in2x));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+
+    v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+    v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
+
+    v_store_select(outx, div, v_zero, r1, r2);
+}
+
 //-------------------------------------------------------------------------------------------------
 
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
-                        std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
+                        std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const float* in2x, DST* outx, const v_float32& v_scale,
+              const v_float32& v_zero)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
 
-    if (length < nlanes)
-        return 0;
-
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_float32 v_zero = vx_setall_f32(0);
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
-
-            v_float32 fdiv1 = vg_load_f32(&in2[x]);
-            v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]);
+    v_float32 fdiv1 = vg_load_f32(in2x);
+    v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]);
 
-            v_float32 r1 = div_op(t, a1, fdiv1, scale);
-            v_float32 r2 = div_op(t, a2, fdiv2, scale);
+    v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale);
+    v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale);
 
     v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
     v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
 
-            v_store_i16(&out[x], res1, res2);
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    v_store_i16(outx, res1, res2);
 }
 
+template<typename scale_tag_t, typename SRC, typename DST, typename Vtype>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx,
+        const v_float32& v_scale, const Vtype& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_float32 a1 = vg_load_f32(in1x);
+    v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]);
+
+    div_simd_impl(s_tag, a1, a2, in2x, outx, v_scale, v_zero);
+}
+
 //-------------------------------------------------------------------------------------------------
 
 template<typename scale_tag_t, typename SRC>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
-                             const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const SRC* in2x,
+                                    float* outx, const v_float32& v_scale)
+{
+    v_float32 b1 = vg_load_f32(in2x);
+    vx_store(outx, div_op(s_tag, a1, b1, v_scale));
+}
+
+template<typename scale_tag_t, typename SRC, typename Tvec>
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, float* outx,
+                              const v_float32& v_scale, const Tvec&)
+{
+    v_float32 a1 = vg_load_f32(in1x);
+    div_simd_impl(s_tag, a1, in2x, outx, v_scale);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[],
+                                     DST out[], const int length, float scale)
 {
-    constexpr int nlanes = v_float32::nlanes;
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
 
     if (length < nlanes)
         return 0;
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+    const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+    v_float32 v_scale = vx_setall_f32(scale);
 
     int x = 0;
     for (;;)
     {
         for (; x <= length - nlanes; x += nlanes)
         {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 b1 = vg_load_f32(&in2[x]);
-
-            vx_store(&out[x], div_op(t, a1, b1, scale));
+            div_hal(s_tag, &in1[x], &in2[x], &out[x], v_scale, v_zero);
         }
 
         if (x < length)
|
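For orientation: div_simd_common() is the single driver that the DIV_SIMD macro below
now calls; it hoists the scale and zero-mask registers out of the loop and dispatches
to the div_hal() overload for the SRC/DST pair. A hedged usage sketch of the caller
contract (toy buffers; div_simd() is generated by the macro, and the scalar tail
handling is assumed from the returned element count, not copied from this patch):

    uchar in1[64], in2[64], out[64];
    // ... fill in1/in2 ...
    int x = div_simd(in1, in2, out, 64, 1.0);  // elements handled by SIMD (0 if row too short)
    for (; x < 64; ++x)                        // caller finishes any remainder in scalar code
        out[x] = (in2[x] == 0) ? 0
                               : cv::saturate_cast<uchar>(in1[x] / static_cast<float>(in2[x]));
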
@@ -580,28 +575,6 @@ CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], fl
     return x;
 }
 
 //-------------------------------------------------------------------------------------------------
 
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
-                             const int length, double scale)
-{
-    hal::div8u(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
-               out, static_cast<size_t>(length), length, 1, &scale);
-    return length;
-}
-
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[],
-                             const int length, double scale)
-{
-    hal::div16s(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
-                out, static_cast<size_t>(length), length, 1, &scale);
-    return length;
-}
-
-//-------------------------------------------------------------------------------------------------
-
 #define DIV_SIMD(SRC, DST)                                                      \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],                       \
              const int length, double _scale)                                   \
@@ -610,13 +583,11 @@ int div_simd(const SRC in1[], const SRC in2[], DST out[],
     float fscale = static_cast<float>(_scale);                                  \
     if (std::fabs(fscale - 1.0f) <= FLT_EPSILON)                                \
     {                                                                           \
-        not_scale_tag t;                                                        \
-        x = div_hal(t, in1, in2, out, length, _scale);                          \
+        x = div_simd_common(not_scale_tag{}, in1, in2, out, length, fscale);    \
     }                                                                           \
     else                                                                        \
     {                                                                           \
-        scale_tag t;                                                            \
-        x = div_hal(t, in1, in2, out, length, _scale);                          \
+        x = div_simd_common(scale_tag{}, in1, in2, out, length, fscale);        \
     }                                                                           \
     return x;                                                                   \
 }
|
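For orientation: not_scale_tag / scale_tag select a div_op() overload at compile
time, so the scale multiply is dropped entirely when scale == 1. div_op is defined
earlier in this file; a minimal sketch of the dispatch idea it relies on (the bodies
below are assumed from the call sites in this patch, not copied from the source):

    struct scale_tag {};
    struct not_scale_tag {};

    CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a,
                                      const v_float32& div, const v_float32& v_scale)
    { return (a * v_scale) / div; }   // scaled path

    CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a,
                                      const v_float32& div, const v_float32&)
    { return a / div; }               // scale == 1: no multiply
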
@@ -1976,14 +1947,432 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec& v_zero)
+{
+    div_simd_impl(s_tag, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, uchar>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec& v_zero)
+{
+    div_simd_impl(s_tag, v_scalar, v_scalar, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, float>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec&)
+{
+    div_simd_impl(s_tag, v_scalar, inx, outx, v_scale);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
+                                       const float scalar[], DST out[],
+                                       const int length, const float scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 v_scalar = vx_load(scalar);
+    v_float32 v_scale = vx_setall_f32(scale);
+    zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            divrc_simd_common_impl(s_tag, &in[x], v_scalar, &out[x], v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uchar* outx,
+                                         const v_float32& s1, const v_float32& s2,
+                                         const v_float32& s3, const v_float32& v_scale,
+                                         const v_uint8& v_zero)
+{
+    v_uint8 div = vx_load(inx);
+    v_uint8 v_mask = (div == v_zero);
+
+    v_uint16 div1 = v_expand_low(div);
+    v_uint16 div2 = v_expand_high(div);
+
+    v_float32 fdiv1 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div1)));
+    v_float32 fdiv2 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div1)));
+    v_float32 fdiv3 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div2)));
+    v_float32 fdiv4 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div2)));
+
+    vx_store(outx,
+             v_select(v_mask, v_zero, v_pack_u(v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                                                      v_round(div_op(s_tag, s2, fdiv2, v_scale))),
+                                               v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                                                      v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<SRC, short>::value ||
+                        std::is_same<SRC, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
+
+    v_int16 v_mask1 = (div1 == v_zero);
+    v_int16 v_mask2 = (div2 == v_zero);
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+
+    vx_store(outx,
+             v_pack_u(v_select(v_mask1, v_zero,
+                               v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                                      v_round(div_op(s_tag, s2, fdiv2, v_scale)))),
+                      v_select(v_mask2, v_zero,
+                               v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                                      v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uchar* outx,
+                                         const v_float32& s1, const v_float32& s2,
+                                         const v_float32& s3, const v_float32& v_scale,
+                                         const v_float32& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_float32 fdiv1 = vg_load_f32(inx);
+    v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]);
+    v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]);
+    v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]);
+
+    v_float32 v_mask1 = (fdiv1 == v_zero);
+    v_float32 v_mask2 = (fdiv2 == v_zero);
+    v_float32 v_mask3 = (fdiv3 == v_zero);
+    v_float32 v_mask4 = (fdiv4 == v_zero);
+
+    vx_store(outx,
+             v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+                             v_round(v_select(v_mask2, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))),
+                      v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+                             v_round(v_select(v_mask4, v_zero, div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar out[],
+                                        const v_float32& s1, const v_float32& s2,
+                                        const v_float32& s3, const v_float32& v_scale,
+                                        const int length, const int nlanes, const int lanes)
+{
+    univ_zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename univ_zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+            divrc_simd_c3_calc(s_tag, &in[x + nlanes], &out[x + nlanes], s2, s3, s1, v_scale, v_zero);
+            divrc_simd_c3_calc(s_tag, &in[x + 2 * nlanes], &out[x + 2 * nlanes], s3, s1, s2, v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    v_uint8 div = vx_load(inx);
+
+    v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div));
+    v_int16 div2 = v_reinterpret_as_s16(v_expand_high(div));
+    v_int16 div3 = v_reinterpret_as_s16(vx_load_expand(&inx[2 * nlanes]));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+    v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+    v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+
+    v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                   v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+    v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                   v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+    v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+                   v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes]));
+    v_int16 div3 = v_reinterpret_as_s16(vx_load(&inx[2*nlanes]));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+    v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+    v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+
+    v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                   v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+    v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                   v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+    v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+                   v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_float32& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_float32 fdiv1 = vg_load_f32(inx);
+    v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]);
+    v_float32 fdiv3 = vg_load_f32(&inx[nlanes]);
+    v_float32 fdiv4 = vg_load_f32(&inx[3*nlanes/2]);
+    v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]);
+    v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]);
+
+    v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+                v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
+    v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+                v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
+    v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
+                v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32& s1,
+                   const v_float32& s2, const v_float32& s3,
+                   const v_float32& v_scale, const int length,
+                   const int, const int lanes)
+{
+    zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, float* out,
+                                        const v_float32& s1, const v_float32& s2,
+                                        const v_float32& s3, const v_float32& v_scale,
+                                        const int length, const int nlanes, const int lanes)
+{
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            v_float32 div1 = vg_load_f32(&in[x]);
+            v_float32 div2 = vg_load_f32(&in[x + nlanes]);
+            v_float32 div3 = vg_load_f32(&in[x + 2*nlanes]);
+
+            vx_store(&out[x], div_op(s_tag, s1, div1, v_scale));
+            vx_store(&out[x + nlanes], div_op(s_tag, s2, div2, v_scale));
+            vx_store(&out[x + 2*nlanes], div_op(s_tag, s3, div3, v_scale));
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[],
+                                   const float scalar[], DST out[],
+                                   const int length, const float scale)
+{
+    constexpr int chan = 3;
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    constexpr int lanes = chan * nlanes;
+
+    if (length < lanes)
+        return 0;
+
+    v_float32 v_scale = vx_setall_f32(scale);
+
+    v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(&scalar[2]);
+    v_float32 s3 = vx_load(&scalar[1]);
+#else
+    v_float32 s2 = vx_load(&scalar[1]);
+    v_float32 s3 = vx_load(&scalar[2]);
+#endif
+    return divrc_simd_c3_impl(s_tag, in, out, s1, s2, s3, v_scale, length, nlanes, lanes);
+}
+
+#define DIVRC_SIMD(SRC, DST)                                            \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],         \
+               const int length, const int chan, const float scale)     \
+{                                                                       \
+    switch (chan)                                                       \
+    {                                                                   \
+    case 1:                                                             \
+    case 2:                                                             \
+    case 4:                                                             \
+    {                                                                   \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                     \
+        {                                                               \
+            return divrc_simd_common(not_scale_tag{}, in, scalar,       \
+                                     out, length, scale);               \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            return divrc_simd_common(scale_tag{}, in, scalar, out,      \
+                                     length, scale);                    \
+        }                                                               \
+    }                                                                   \
+    case 3:                                                             \
+    {                                                                   \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                     \
+        {                                                               \
+            return divrc_simd_c3(not_scale_tag{}, in, scalar,           \
+                                 out, length, scale);                   \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            return divrc_simd_c3(scale_tag{}, in, scalar, out,          \
+                                 length, scale);                        \
+        }                                                               \
+    }                                                                   \
+    default:                                                            \
+        GAPI_Assert(chan <= 4);                                         \
+        break;                                                          \
+    }                                                                   \
+    return 0;                                                           \
+}
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 //-------------------------
 //
 // Fluid kernels: Split3
 //
 //-------------------------
 
-int split3_simd(const uchar in[], uchar out1[], uchar out2[],
-                uchar out3[], const int width)
+int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
+                const int width)
 {
     constexpr int nlanes = v_uint8::nlanes;
     if (width < nlanes)