@@ -253,6 +253,28 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+#define SUB_SIMD(SRC, DST)                                                    \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length);
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width);
@@ -2530,32 +2552,43 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
 // Fluid kernels: Add
 //
 //-------------------------
 
-CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_uint16& c1, const v_uint16& c2)
+template<typename VT>
+CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b)
+{
+    return a + b;
+}
+
+template<typename VT>
+CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b)
+{
+    return a - b;
+}
+
+CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2)
 {
     vx_store(outx, v_pack(c1, c2));
 }
 
-CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_int16& c1, const v_int16& c2)
+CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_int16& c1, const v_int16& c2)
 {
     vx_store(outx, v_pack_u(c1, c2));
 }
 
-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, DST>::value, void>::type
-add_simd_impl(const SRC* in1x, const SRC* in2x, DST* outx)
+arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, DST* outx)
 {
     vector_type_of_t<SRC> a = vx_load(in1x);
     vector_type_of_t<SRC> b = vx_load(in2x);
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-template<typename SRC>
+template<typename oper_tag, typename SRC>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, short>::value ||
                         std::is_same<SRC, ushort>::value, void>::type
-add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx)
+arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx)
 {
     constexpr int nlanes = v_uint8::nlanes;
@@ -2564,10 +2597,12 @@ add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx)
     vector_type_of_t<SRC> b1 = vx_load(in2x);
     vector_type_of_t<SRC> b2 = vx_load(&in2x[nlanes / 2]);
 
-    add_uchar_store(outx, a1 + b1, a2 + b2);
+    pack_store_uchar(outx, oper(op, a1, b1), oper(op, a2, b2));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x,
+                                         const float* in2x, uchar* outx)
 {
     constexpr int nlanes = v_uint8::nlanes;
@@ -2581,31 +2616,35 @@ CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar*
     v_float32 b3 = vx_load(&in2x[2 * nlanes / 4]);
     v_float32 b4 = vx_load(&in2x[3 * nlanes / 4]);
 
-    vx_store(outx, v_pack_u(v_pack(v_round(a1 + b1), v_round(a2 + b2)),
-                            v_pack(v_round(a3 + b3), v_round(a4 + b4))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2))),
+                            v_pack(v_round(oper(op, a3, b3)), v_round(oper(op, a4, b4)))));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, short* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x,
+                                         const uchar* in2x, short* outx)
 {
     v_int16 a = v_reinterpret_as_s16(vx_load_expand(in1x));
     v_int16 b = v_reinterpret_as_s16(vx_load_expand(in2x));
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, ushort* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x,
+                                         const uchar* in2x, ushort* outx)
 {
     v_uint16 a = vx_load_expand(in1x);
     v_uint16 b = vx_load_expand(in2x);
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-template<typename DST>
+template<typename oper_tag, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, void>::type
-add_simd_impl(const float* in1x, const float* in2x, DST* outx)
+arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
     v_float32 a1 = vx_load(in1x);
@@ -2613,10 +2652,12 @@ add_simd_impl(const float* in1x, const float* in2x, DST* outx)
     v_float32 b1 = vx_load(in2x);
     v_float32 b2 = vx_load(&in2x[nlanes/2]);
 
-    v_store_i16(outx, v_round(a1 + b1), v_round(a2 + b2));
+    v_store_i16(outx, v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2)));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const short* in1x,
+                                         const short* in2x, ushort* outx)
 {
     v_int16 a = vx_load(in1x);
     v_int32 a1 = v_expand_low(a);
@@ -2626,57 +2667,66 @@ CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort
     v_int32 b1 = v_expand_low(b);
     v_int32 b2 = v_expand_high(b);
 
-    vx_store(outx, v_pack_u(a1 + b1, a2 + b2));
+    vx_store(outx, v_pack_u(oper(op, a1, b1), oper(op, a2, b2)));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const ushort* in1x, const ushort* in2x, short* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const ushort* in1x,
+                                         const ushort* in2x, short* outx)
 {
-    v_uint16 a = vx_load(in1x);
-    v_uint32 a1 = v_expand_low(a);
-    v_uint32 a2 = v_expand_high(a);
+    v_int16 a = v_reinterpret_as_s16(vx_load(in1x));
+    v_int32 a1 = v_expand_low(a);
+    v_int32 a2 = v_expand_high(a);
 
-    v_uint16 b = vx_load(in2x);
-    v_uint32 b1 = v_expand_low(b);
-    v_uint32 b2 = v_expand_high(b);
+    v_int16 b = v_reinterpret_as_s16(vx_load(in2x));
+    v_int32 b1 = v_expand_low(b);
+    v_int32 b2 = v_expand_high(b);
 
-    vx_store(outx, v_reinterpret_as_s16(v_pack(a1 + b1, a2 + b2)));
+    vx_store(outx, v_pack(oper(op, a1, b1), oper(op, a2, b2)));
 }
 
-template<typename SRC>
-CV_ALWAYS_INLINE void add_simd_impl(const SRC* in1x, const SRC* in2x, float* outx)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, float* outx)
 {
     v_float32 a = vg_load_f32(in1x);
     v_float32 b = vg_load_f32(in2x);
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[],
+                                   DST out[], const int length)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            arithmOp_simd_impl(op, &in1[x], &in2[x], &out[x]);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;
+        }
+        break;
+    }
+
+    return x;
+}
+
 #define ADD_SIMD(SRC, DST)                                                    \
 int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length)   \
 {                                                                             \
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;                     \
-                                                                              \
-    if (length < nlanes)                                                      \
-        return 0;                                                             \
-                                                                              \
-    int x = 0;                                                                \
-    for (;;)                                                                  \
-    {                                                                         \
-        for (; x <= length - nlanes; x += nlanes)                             \
-        {                                                                     \
-            add_simd_impl(&in1[x], &in2[x], &out[x]);                         \
-        }                                                                     \
-                                                                              \
-        if (x < length)                                                       \
-        {                                                                     \
-            x = length - nlanes;                                              \
-            continue;                                                         \
-        }                                                                     \
-        break;                                                                \
-    }                                                                         \
-                                                                              \
-    return x;                                                                 \
-}
+    return arithmOp_simd(add_tag{}, in1, in2, out, length);                   \
+}                                                                             \
 
 ADD_SIMD(uchar, uchar)
 ADD_SIMD(ushort, uchar)
@@ -2697,6 +2747,37 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+//-------------------------
+//
+// Fluid kernels: Sub
+//
+//-------------------------
+
+#define SUB_SIMD(SRC, DST)                                                    \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length)   \
+{                                                                             \
+    return arithmOp_simd(sub_tag{}, in1, in2, out, length);                   \
+}                                                                             \
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
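
Note, not part of the patch: the refactoring above is standard tag dispatch. The empty add_tag/sub_tag structs select an oper() overload at compile time, so each arithmOp_simd_impl overload and the arithmOp_simd driver are written once and shared by add_simd and sub_simd, which is why both macros collapse to one-line bodies. A minimal self-contained sketch of the same technique follows, with a plain scalar loop standing in for the SIMD intrinsics; the arithm_op driver and the main() harness are illustrative stand-ins, not code from the patch.

    #include <cstdio>

    struct add_tag {};
    struct sub_tag {};

    // One overload per operation; the tag argument exists only for dispatch.
    template<typename T>
    inline T oper(add_tag, const T& a, const T& b) { return a + b; }

    template<typename T>
    inline T oper(sub_tag, const T& a, const T& b) { return a - b; }

    // Generic driver written once; the loop body never names + or - directly.
    // (Scalar stand-in for the SIMD kernels; arithm_op is a hypothetical name.)
    template<typename oper_tag, typename SRC, typename DST>
    int arithm_op(oper_tag op, const SRC in1[], const SRC in2[], DST out[], const int length)
    {
        for (int x = 0; x < length; ++x)
            out[x] = static_cast<DST>(oper(op, in1[x], in2[x]));
        return length;
    }

    int main()
    {
        const float a[] = {1.f, 2.f, 3.f, 4.f};
        const float b[] = {4.f, 3.f, 2.f, 1.f};
        float sum[4], dif[4];

        arithm_op(add_tag{}, a, b, sum, 4);  // resolves to oper(add_tag, ...)
        arithm_op(sub_tag{}, a, b, dif, 4);  // resolves to oper(sub_tag, ...)

        std::printf("sum[0]=%g dif[0]=%g\n", sum[0], dif[0]);  // prints 5 and -3
        return 0;
    }

Because the tag is a distinct type rather than a runtime flag, the compiler inlines the selected oper() into every arithmOp_simd_impl instantiation, so the shared driver costs nothing over the hand-written add-only version it replaces.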