Merge pull request #24324 from hanliutong:rewrite-fluid

Rewrite Universal Intrinsic code: gapi module (fluid part).
pull/24371/head
Alexander Smorkalov (committed by GitHub)
commit cd7cbe3d41
Changed files:
1. modules/gapi/src/backends/fluid/gfluidcore.cpp (60 lines changed)
2. modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp (2 lines changed)
3. modules/gapi/src/backends/fluid/gfluidcore_func.hpp (2 lines changed)
4. modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp (202 lines changed)
5. modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp (461 lines changed)
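
Every hunk below applies the same mechanical rewrite from the old universal-intrinsic API to the new one: the SIMD guards also accept scalable-vector builds (CV_SIMD_SCALABLE, used e.g. for RISC-V RVV), the compile-time ::nlanes constants become run-time VTraits<...>::vlanes() queries, and the overloaded vector operators are replaced by named wrappers such as v_add, v_mul, v_div and v_eq. A minimal sketch of the resulting style, not taken from the patch itself (the function name and loop shape are illustrative only):

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
// New-style universal intrinsics: lane count queried at run time, named vector ops.
static int add_f32_simd(const float in1[], const float in2[], float out[], const int length)
{
    const int nlanes = cv::VTraits<cv::v_float32>::vlanes(); // not constexpr: unknown at compile time on RVV
    if (length < nlanes)
        return 0;                                            // caller handles the scalar tail
    int x = 0;
    for (; x <= length - nlanes; x += nlanes)
    {
        cv::v_float32 a = cv::vx_load(&in1[x]);
        cv::v_float32 b = cv::vx_load(&in2[x]);
        cv::vx_store(&out[x], cv::v_add(a, b));              // old API: a + b
    }
    return x;                                                // number of elements processed with SIMD
}
#endif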

@ -13,7 +13,7 @@
#include <opencv2/core/hal/hal.hpp> #include <opencv2/core/hal/hal.hpp>
#include <opencv2/core/hal/intrin.hpp> #include <opencv2/core/hal/intrin.hpp>
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
#include "gfluidcore_func.hpp" #include "gfluidcore_func.hpp"
#endif #endif
@ -113,7 +113,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
// Fluid kernels: addWeighted // Fluid kernels: addWeighted
// //
//--------------------------- //---------------------------
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in) CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
{ {
return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in))); return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
@ -150,8 +150,8 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
((std::is_same<SRC, short>::value) && (std::is_same<DST, short>::value)), ((std::is_same<SRC, short>::value) && (std::is_same<DST, short>::value)),
"This templated overload is only for short and ushort type combinations."); "This templated overload is only for short and ushort type combinations.");
constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) : const int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(VTraits<v_uint16>::vlanes()) :
static_cast<int>(v_int16::nlanes); static_cast<int>(VTraits<v_int16>::vlanes());
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -189,7 +189,7 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
const float _alpha, const float _beta, const float _alpha, const float _beta,
const float _gamma, int length) const float _gamma, int length)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -298,7 +298,7 @@ GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE }; enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE };
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x) CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x)
{ {
vx_store(&out[x], v_absdiffs(a, b)); vx_store(&out[x], v_absdiffs(a, b));
@ -322,7 +322,7 @@ CV_ALWAYS_INLINE void absdiff_store(float out[], const v_float32& a, const v_flo
template<typename T, typename VT> template<typename T, typename VT>
CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length) CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length)
{ {
constexpr int nlanes = static_cast<int>(VT::nlanes); const int nlanes = static_cast<int>(VTraits<VT>::vlanes());
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -403,7 +403,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
{ {
case ARITHM_ADD: case ARITHM_ADD:
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = add_simd(in1, in2, out, length); x = add_simd(in1, in2, out, length);
#endif #endif
for (; x < length; ++x) for (; x < length; ++x)
@ -412,7 +412,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
} }
case ARITHM_SUBTRACT: case ARITHM_SUBTRACT:
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = sub_simd(in1, in2, out, length); x = sub_simd(in1, in2, out, length);
#endif #endif
for (; x < length; ++x) for (; x < length; ++x)
@ -421,7 +421,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
} }
case ARITHM_MULTIPLY: case ARITHM_MULTIPLY:
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = mul_simd(in1, in2, out, length, scale); x = mul_simd(in1, in2, out, length, scale);
#endif #endif
for (; x < length; ++x) for (; x < length; ++x)
@ -430,7 +430,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
} }
case ARITHM_DIVIDE: case ARITHM_DIVIDE:
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = div_simd(in1, in2, out, length, scale); x = div_simd(in1, in2, out, length, scale);
#endif #endif
for (; x < length; ++x) for (; x < length; ++x)
@ -569,7 +569,7 @@ static void run_absdiff(Buffer &dst, const View &src1, const View &src2)
int x = 0; int x = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = absdiff_simd(in1, in2, out, length); x = absdiff_simd(in1, in2, out, length);
#endif #endif
for (; x < length; ++x) for (; x < length; ++x)
@ -660,7 +660,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_ADD: case ARITHM_ADD:
{ {
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = addc_simd(in, scalar, out, length, chan); w = addc_simd(in, scalar, out, length, chan);
#endif #endif
for (; w < length; ++w) for (; w < length; ++w)
@ -671,7 +671,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_SUBTRACT: case ARITHM_SUBTRACT:
{ {
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = subc_simd(in, scalar, out, length, chan); w = subc_simd(in, scalar, out, length, chan);
#endif #endif
for (; w < length; ++w) for (; w < length; ++w)
@ -681,7 +681,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_MULTIPLY: case ARITHM_MULTIPLY:
{ {
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = mulc_simd(in, scalar, out, length, chan, scale); w = mulc_simd(in, scalar, out, length, chan, scale);
#endif #endif
for (; w < width; ++w) for (; w < width; ++w)
@ -709,7 +709,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
case ARITHM_SUBTRACT: case ARITHM_SUBTRACT:
{ {
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = subrc_simd(scalar, in, out, length, chan); w = subrc_simd(scalar, in, out, length, chan);
#endif #endif
for (; w < length; ++w) for (; w < length; ++w)
@ -721,7 +721,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
case ARITHM_DIVIDE: case ARITHM_DIVIDE:
{ {
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = divrc_simd(scalar, in, out, length, chan, scale); w = divrc_simd(scalar, in, out, length, chan, scale);
#endif #endif
for (; w < length; ++w) for (; w < length; ++w)
@ -744,7 +744,7 @@ CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen)
CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
// 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector. // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
constexpr int maxNlanes = 16; constexpr int maxNlanes = 16;
@ -783,7 +783,7 @@ CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float sca
const int length = width * chan; const int length = width * chan;
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = absdiffc_simd(in, scalar, out, length, chan); w = absdiffc_simd(in, scalar, out, length, chan);
#endif #endif
@ -1076,7 +1076,7 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch,
const int length = width * chan; const int length = width * chan;
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
int scratch_length = scratch.length(); int scratch_length = scratch.length();
int indicator_offset = scratch_length - 1; int indicator_offset = scratch_length - 1;
const int set_mask_indicator = static_cast<int>(*(scratch.OutLine<float>() + (indicator_offset))); const int set_mask_indicator = static_cast<int>(*(scratch.OutLine<float>() + (indicator_offset)));
@ -1143,7 +1143,7 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch) static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch)
{ {
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
// 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain. // 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain.
constexpr int maxNlanes = 16; constexpr int maxNlanes = 16;
@ -1565,7 +1565,7 @@ template<typename SRC, typename DST>
CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length) CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length)
{ {
int x = 0; int x = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = convertto_simd(in, out, length); x = convertto_simd(in, out, length);
#endif #endif
// tail of SIMD cycle // tail of SIMD cycle
@ -1580,7 +1580,7 @@ CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha,
const int length) const int length)
{ {
int x = 0; int x = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
x = convertto_scaled_simd(in, out, alpha, beta, length); x = convertto_scaled_simd(in, out, alpha, beta, length);
#endif #endif
@ -2096,9 +2096,7 @@ static void run_inrange3(uchar out[], const uchar in[], int width,
v_load_deinterleave(&in[3*w], i0, i1, i2); v_load_deinterleave(&in[3*w], i0, i1, i2);
v_uint8x16 o; v_uint8x16 o;
o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) & o = v_and(v_and(v_and(v_and(v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0]))), v_ge(i1, v_setall_u8(lower[1]))), v_le(i1, v_setall_u8(upper[1]))), v_ge(i2, v_setall_u8(lower[2]))), v_le(i2, v_setall_u8(upper[2])));
(i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) &
(i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2]));
v_store(&out[w], o); v_store(&out[w], o);
} }
@ -2226,7 +2224,7 @@ static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uc
v_load_deinterleave(&in2[3*w], a2, b2, c2); v_load_deinterleave(&in2[3*w], a2, b2, c2);
mask = v_load(&in3[w]); mask = v_load(&in3[w]);
mask = mask != v_setzero_u8(); mask = v_ne(mask, v_setzero_u8());
a = v_select(mask, a1, a2); a = v_select(mask, a1, a2);
b = v_select(mask, b1, b2); b = v_select(mask, b1, b2);
@ -2332,7 +2330,7 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
int width = src.length(); int width = src.length();
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = split3_simd(in, out1, out2, out3, width); w = split3_simd(in, out1, out2, out3, width);
#endif #endif
@ -2364,7 +2362,7 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
int width = src.length(); int width = src.length();
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = split4_simd(in, out1, out2, out3, out4, width); w = split4_simd(in, out1, out2, out3, out4, width);
#endif #endif
@ -2389,7 +2387,7 @@ CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2
int width = dst.length(); int width = dst.length();
int w = 0; int w = 0;
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = merge3_simd(in1, in2, in3, out, width); w = merge3_simd(in1, in2, in3, out, width);
#endif #endif
@ -2442,7 +2440,7 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
int w = 0; // cycle counter int w = 0; // cycle counter
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
w = merge4_simd(in1, in2, in3, in4, out, width); w = merge4_simd(in1, in2, in3, in4, out, width);
#endif #endif
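
A note on the constexpr-to-const changes in the hunks above: under CV_SIMD_SCALABLE the vector width is a property of the hardware rather than of the type, so v_uint8::nlanes has no compile-time value and VTraits<v_uint8>::vlanes() has to be evaluated at run time. Derived quantities such as lanes = chan * nlanes in the three-channel kernels therefore become plain const int as well. A small sketch of the idiom, with an illustrative helper name:

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Largest prefix of a 3-channel row that whole vectors can cover.
static int simd_prefix_c3(const int length)
{
    constexpr int chan = 3;                                 // still a genuine compile-time constant
    const int nlanes = cv::VTraits<cv::v_uint8>::vlanes();  // run-time on scalable targets
    const int lanes  = chan * nlanes;                       // was constexpr in the old API
    return (length < lanes) ? 0 : (length / lanes) * lanes;
}
#endif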

@ -7,7 +7,7 @@
#if !defined(GAPI_STANDALONE) #if !defined(GAPI_STANDALONE)
#include <opencv2/core/hal/intrin.hpp> #include <opencv2/core/hal/intrin.hpp>
#if CV_SIMD #if (CV_SIMD || CV_SIMD_SCALABLE)
#include "gfluidcore_func.hpp" #include "gfluidcore_func.hpp"
#include "gfluidcore_func.simd.hpp" #include "gfluidcore_func.simd.hpp"

@ -6,7 +6,7 @@
#pragma once #pragma once
#if !defined(GAPI_STANDALONE) && CV_SIMD #if !defined(GAPI_STANDALONE) && (CV_SIMD || CV_SIMD_SCALABLE)
#include <opencv2/core.hpp> #include <opencv2/core.hpp>

@ -402,22 +402,22 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale) CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale)
{ {
return (scale*a * b); return (v_mul(v_mul(scale, a), b));
} }
CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&) CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&)
{ {
return a * b; return v_mul(a, b);
} }
CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale) CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
{ {
return (a*scale/div); return (v_div(v_mul(a, scale), div));
} }
CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&) CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&)
{ {
return a / div; return v_div(a, div);
} }
CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2) CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2)
@ -433,13 +433,13 @@ CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int3
CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero, CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero,
const v_int32& res1, const v_int32& res2) const v_int32& res1, const v_int32& res2)
{ {
vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2))); vx_store(dst, v_select(v_eq(div, v_zero), v_zero, v_pack(res1, res2)));
} }
CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero, CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
const v_int32& res1, const v_int32& res2) const v_int32& res1, const v_int32& res2)
{ {
vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero), vx_store(dst, v_select(v_reinterpret_as_u16(v_eq(div, v_zero)),
v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2))); v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2)));
} }
@ -451,7 +451,7 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
const v_float32& a3, const v_float32& a4, const uchar* in2x, const v_float32& a3, const v_float32& a4, const uchar* in2x,
uchar* outx, const v_float32& v_scale, const v_int16& v_zero) uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x)); v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x));
v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2])); v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2]));
@ -466,8 +466,8 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2));
v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4));
vx_store(outx, v_pack_u(res1, res2)); vx_store(outx, v_pack_u(res1, res2));
} }
@ -480,7 +480,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
const v_float32& a3, const v_float32& a4, const SRC* in2x, const v_float32& a3, const v_float32& a4, const SRC* in2x,
uchar* outx, const v_float32& v_scale, const v_int16& v_zero) uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x)); v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x));
v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2])); v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2]));
@ -495,8 +495,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2));
v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4));
vx_store(outx, v_pack_u(res1, res2)); vx_store(outx, v_pack_u(res1, res2));
} }
@ -507,7 +507,7 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
const v_float32& a4, const float* in2x, uchar* outx, const v_float32& a4, const float* in2x, uchar* outx,
const v_float32& v_scale, const v_float32& v_zero) const v_float32& v_scale, const v_float32& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 div1 = vg_load_f32(in2x); v_float32 div1 = vg_load_f32(in2x);
v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]); v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]);
@ -519,10 +519,10 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
v_float32 r3 = div_op(s_tag, a3, div3, v_scale); v_float32 r3 = div_op(s_tag, a3, div3, v_scale);
v_float32 r4 = div_op(s_tag, a4, div4, v_scale); v_float32 r4 = div_op(s_tag, a4, div4, v_scale);
v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1); v_float32 sel1 = v_select((v_eq(div1, v_zero)), v_zero, r1);
v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2); v_float32 sel2 = v_select((v_eq(div2, v_zero)), v_zero, r2);
v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3); v_float32 sel3 = v_select((v_eq(div3, v_zero)), v_zero, r3);
v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4); v_float32 sel4 = v_select((v_eq(div4, v_zero)), v_zero, r4);
v_int32 res1 = v_round(sel1); v_int32 res1 = v_round(sel1);
v_int32 res2 = v_round(sel2); v_int32 res2 = v_round(sel2);
@ -536,7 +536,7 @@ template<typename scale_tag_t, typename SRC, typename Vtype>
CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx, CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx,
const v_float32& v_scale, const Vtype& v_zero) const v_float32& v_scale, const Vtype& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 a1 = vg_load_f32(in1x); v_float32 a1 = vg_load_f32(in1x);
v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]); v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]);
@ -595,7 +595,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
const float* in2x, DST* outx, const v_float32& v_scale, const float* in2x, DST* outx, const v_float32& v_scale,
const v_float32& v_zero) const v_float32& v_zero)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 fdiv1 = vg_load_f32(in2x); v_float32 fdiv1 = vg_load_f32(in2x);
v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]); v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]);
@ -603,8 +603,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale); v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale);
v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale); v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale);
v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1)); v_int32 res1 = v_round(v_select((v_eq(fdiv1, v_zero)), v_zero, r1));
v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2)); v_int32 res2 = v_round(v_select((v_eq(fdiv2, v_zero)), v_zero, r2));
v_store_i16(outx, res1, res2); v_store_i16(outx, res1, res2);
} }
@ -616,7 +616,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx, div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx,
const v_float32& v_scale, const Vtype& v_zero) const v_float32& v_scale, const Vtype& v_zero)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 a1 = vg_load_f32(in1x); v_float32 a1 = vg_load_f32(in1x);
v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]); v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]);
@ -648,12 +648,12 @@ template<typename scale_tag_t, typename SRC, typename DST>
CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[], CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[],
DST out[], const int length, float scale) DST out[], const int length, float scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0); const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename VTraits< zero_vec_type_of_t<SRC> >::lane_type>(0);
v_float32 v_scale = vx_setall_f32(scale); v_float32 v_scale = vx_setall_f32(scale);
int x = 0; int x = 0;
@ -724,7 +724,7 @@ typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, us
(std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale) mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -769,7 +769,7 @@ typename std::enable_if<std::is_same<SRC, short>::value ||
std::is_same<SRC, ushort>::value, int>::type std::is_same<SRC, ushort>::value, int>::type
mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale) mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -824,7 +824,7 @@ template<typename scale_tag_t>
CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[], CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
const int length, double _scale) const int length, double _scale)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -869,7 +869,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
std::is_same<DST, ushort>::value, int>::type std::is_same<DST, ushort>::value, int>::type
mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale) mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -914,7 +914,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
std::is_same<DST, ushort>::value, int>::type std::is_same<DST, ushort>::value, int>::type
mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale) mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -954,7 +954,7 @@ template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[], CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
const int length, double _scale) const int length, double _scale)
{ {
constexpr int nlanes = v_float32::nlanes; const int nlanes = VTraits<v_float32>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -1049,7 +1049,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_in
const v_int32& c4, const v_int32& c5, const v_int32& c4, const v_int32& c5,
const v_int32& c6) const v_int32& c6)
{ {
constexpr int nlanes = v_int16::nlanes; const int nlanes = VTraits<v_int16>::vlanes();
vx_store(outx, v_pack(c1, c2)); vx_store(outx, v_pack(c1, c2));
vx_store(&outx[nlanes], v_pack(c3, c4)); vx_store(&outx[nlanes], v_pack(c3, c4));
vx_store(&outx[2*nlanes], v_pack(c5, c6)); vx_store(&outx[2*nlanes], v_pack(c5, c6));
@ -1060,7 +1060,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in
const v_int32& c4, const v_int32& c5, const v_int32& c4, const v_int32& c5,
const v_int32& c6) const v_int32& c6)
{ {
constexpr int nlanes = v_uint16::nlanes; const int nlanes = VTraits<v_uint16>::vlanes();
vx_store(outx, v_pack_u(c1, c2)); vx_store(outx, v_pack_u(c1, c2));
vx_store(&outx[nlanes], v_pack_u(c3, c4)); vx_store(&outx[nlanes], v_pack_u(c3, c4));
vx_store(&outx[2*nlanes], v_pack_u(c5, c6)); vx_store(&outx[2*nlanes], v_pack_u(c5, c6));
@ -1068,37 +1068,37 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in
CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc)
{ {
return a + sc; return v_add(a, sc);
} }
CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc)
{ {
return a - sc; return v_sub(a, sc);
} }
CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc)
{ {
return sc - a; return v_sub(sc, a);
} }
CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
{ {
return a * sc; return v_mul(a, sc);
} }
CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
{ {
return v_scale * a * v_scalar; return v_mul(v_mul(v_scale, a), v_scalar);
} }
CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc)
{ {
return a / sc; return v_div(a, sc);
} }
CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
{ {
return a*v_scale / v_scalar; return v_div(v_mul(a, v_scale), v_scalar);
} }
CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc) CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc)
@ -1223,8 +1223,8 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[],
const int length) const int length)
{ {
constexpr int chan = 3; constexpr int chan = 3;
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
constexpr int lanes = chan * nlanes; const int lanes = chan * nlanes;
if (length < lanes) if (length < lanes)
return 0; return 0;
@ -1263,7 +1263,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
const float scalar[], DST out[], const float scalar[], DST out[],
const int length) const int length)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -1489,8 +1489,8 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int chan = 3; constexpr int chan = 3;
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
constexpr int lanes = chan * nlanes; const int lanes = chan * nlanes;
if (length < lanes) if (length < lanes)
return 0; return 0;
@ -1576,7 +1576,7 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[
const float scalar[], DST out[], const float scalar[], DST out[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -1675,10 +1675,10 @@ divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[],
const v_float32& v_scalar, const v_float32& v_scale, const v_float32& v_scalar, const v_float32& v_scale,
const int length) const int length)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 v_zero = vx_setzero_f32(); v_float32 v_zero = vx_setzero_f32();
v_float32 v_mask = (v_scalar == v_zero); v_float32 v_mask = (v_eq(v_scalar, v_zero));
int x = 0; int x = 0;
for (;;) for (;;)
@ -1709,10 +1709,10 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
uchar out[], const v_float32& v_scalar, uchar out[], const v_float32& v_scalar,
const v_float32& v_scale, const int length) const v_float32& v_scale, const int length)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 v_zero = vx_setzero_f32(); v_float32 v_zero = vx_setzero_f32();
v_float32 v_mask = (v_scalar == v_zero); v_float32 v_mask = (v_eq(v_scalar, v_zero));
int x = 0; int x = 0;
for (;;) for (;;)
@ -1747,7 +1747,7 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
float out[], const v_float32& v_scalar, float out[], const v_float32& v_scalar,
const v_float32& v_scale, const int length) const v_float32& v_scale, const int length)
{ {
constexpr int nlanes = v_float32::nlanes; const int nlanes = VTraits<v_float32>::vlanes();
int x = 0; int x = 0;
for (;;) for (;;)
{ {
@ -1774,7 +1774,7 @@ CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[],
const float scalar[], DST out[], const float scalar[], DST out[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -1796,9 +1796,9 @@ divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1,
const int nlanes, const int lanes) const int nlanes, const int lanes)
{ {
v_float32 v_zero = vx_setzero_f32(); v_float32 v_zero = vx_setzero_f32();
v_float32 v_mask1 = (s1 == v_zero); v_float32 v_mask1 = (v_eq(s1, v_zero));
v_float32 v_mask2 = (s2 == v_zero); v_float32 v_mask2 = (v_eq(s2, v_zero));
v_float32 v_mask3 = (s3 == v_zero); v_float32 v_mask3 = (v_eq(s3, v_zero));
int x = 0; int x = 0;
for (;;) for (;;)
@ -1839,9 +1839,9 @@ CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar*
const int length, const int nlanes, const int lanes) const int length, const int nlanes, const int lanes)
{ {
v_float32 v_zero = vx_setzero_f32(); v_float32 v_zero = vx_setzero_f32();
v_float32 v_mask1 = (s1 == v_zero); v_float32 v_mask1 = (v_eq(s1, v_zero));
v_float32 v_mask2 = (s2 == v_zero); v_float32 v_mask2 = (v_eq(s2, v_zero));
v_float32 v_mask3 = (s3 == v_zero); v_float32 v_mask3 = (v_eq(s3, v_zero));
int x = 0; int x = 0;
for (;;) for (;;)
@ -1917,8 +1917,8 @@ CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int chan = 3; constexpr int chan = 3;
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
constexpr int lanes = chan * nlanes; const int lanes = chan * nlanes;
if (length < lanes) if (length < lanes)
return 0; return 0;
@ -2084,7 +2084,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
const float scalar[], DST out[], const float scalar[], DST out[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -2092,7 +2092,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
v_float32 v_scalar = vx_load(scalar); v_float32 v_scalar = vx_load(scalar);
v_float32 v_scale = vx_setall_f32(scale); v_float32 v_scale = vx_setall_f32(scale);
zero_vec_type_of_t<SRC> v_zero = zero_vec_type_of_t<SRC> v_zero =
vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0); vx_setall<typename VTraits<zero_vec_type_of_t<SRC>>::lane_type>(0);
int x = 0; int x = 0;
for (;;) for (;;)
@ -2121,7 +2121,7 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uc
const v_uint8& v_zero) const v_uint8& v_zero)
{ {
v_uint8 div = vx_load(inx); v_uint8 div = vx_load(inx);
v_uint8 v_mask = (div == v_zero); v_uint8 v_mask = (v_eq(div, v_zero));
v_uint16 div1 = v_expand_low(div); v_uint16 div1 = v_expand_low(div);
v_uint16 div2 = v_expand_high(div); v_uint16 div2 = v_expand_high(div);
@ -2147,13 +2147,13 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx,
const v_float32& s3, const v_float32& v_scale, const v_float32& s3, const v_float32& v_scale,
const v_int16& v_zero) const v_int16& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
v_int16 v_mask1 = (div1 == v_zero); v_int16 v_mask1 = (v_eq(div1, v_zero));
v_int16 v_mask2 = (div2 == v_zero); v_int16 v_mask2 = (v_eq(div2, v_zero));
v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1)); v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1)); v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
@ -2175,17 +2175,17 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uc
const v_float32& s3, const v_float32& v_scale, const v_float32& s3, const v_float32& v_scale,
const v_float32& v_zero) const v_float32& v_zero)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv1 = vg_load_f32(inx);
v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]); v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]);
v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]); v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]);
v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]); v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]);
v_float32 v_mask1 = (fdiv1 == v_zero); v_float32 v_mask1 = (v_eq(fdiv1, v_zero));
v_float32 v_mask2 = (fdiv2 == v_zero); v_float32 v_mask2 = (v_eq(fdiv2, v_zero));
v_float32 v_mask3 = (fdiv3 == v_zero); v_float32 v_mask3 = (v_eq(fdiv3, v_zero));
v_float32 v_mask4 = (fdiv4 == v_zero); v_float32 v_mask4 = (v_eq(fdiv4, v_zero));
vx_store(outx, vx_store(outx,
v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
@ -2202,7 +2202,7 @@ CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar
const int length, const int nlanes, const int lanes) const int length, const int nlanes, const int lanes)
{ {
univ_zero_vec_type_of_t<SRC> v_zero = univ_zero_vec_type_of_t<SRC> v_zero =
vx_setall<typename univ_zero_vec_type_of_t<SRC>::lane_type>(0); vx_setall<typename VTraits<univ_zero_vec_type_of_t<SRC>>::lane_type>(0);
int x = 0; int x = 0;
for (;;) for (;;)
@ -2235,7 +2235,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx,
const v_float32& s3, const v_float32& v_scale, const v_float32& s3, const v_float32& v_scale,
const v_int16& v_zero) const v_int16& v_zero)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_uint8 div = vx_load(inx); v_uint8 div = vx_load(inx);
v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div)); v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div));
@ -2268,7 +2268,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx,
const v_float32& s3, const v_float32& v_scale, const v_float32& s3, const v_float32& v_scale,
const v_int16& v_zero) const v_int16& v_zero)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes])); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes]));
@ -2298,7 +2298,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
const v_float32& s3, const v_float32& v_scale, const v_float32& s3, const v_float32& v_scale,
const v_float32& v_zero) const v_float32& v_zero)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv1 = vg_load_f32(inx);
v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]); v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]);
@ -2307,12 +2307,12 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]); v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]);
v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]); v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]);
v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), v_store_i16(outx, v_round(v_select(v_eq(fdiv1, v_zero), v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))); v_round(v_select(v_eq(fdiv2, v_zero), v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))), v_store_i16(&outx[nlanes], v_round(v_select(v_eq(fdiv3, v_zero), v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale)))); v_round(v_select(v_eq(fdiv4, v_zero), v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))), v_store_i16(&outx[2*nlanes], v_round(v_select(v_eq(fdiv5, v_zero), v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale)))); v_round(v_select(v_eq(fdiv6, v_zero), v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
} }
template<typename scale_tag_t, typename SRC, typename DST> template<typename scale_tag_t, typename SRC, typename DST>
@ -2325,7 +2325,7 @@ divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32
const int, const int lanes) const int, const int lanes)
{ {
zero_vec_type_of_t<SRC> v_zero = zero_vec_type_of_t<SRC> v_zero =
vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0); vx_setall<typename VTraits<zero_vec_type_of_t<SRC>>::lane_type>(0);
int x = 0; int x = 0;
for (;;) for (;;)
@ -2385,8 +2385,8 @@ CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[],
const int length, const float scale) const int length, const float scale)
{ {
constexpr int chan = 3; constexpr int chan = 3;
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
constexpr int lanes = chan * nlanes; const int lanes = chan * nlanes;
if (length < lanes) if (length < lanes)
return 0; return 0;
@ -2473,7 +2473,7 @@ DIVRC_SIMD(float, float)
int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
const int width) const int width)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (width < nlanes) if (width < nlanes)
return 0; return 0;
@ -2507,7 +2507,7 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
int split4_simd(const uchar in[], uchar out1[], uchar out2[], int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width) uchar out3[], uchar out4[], const int width)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (width < nlanes) if (width < nlanes)
return 0; return 0;
@ -2543,7 +2543,7 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
int merge3_simd(const T in1[], const T in2[], const T in3[], \ int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width) \ T out[], const int width) \
{ \ { \
constexpr int nlanes = vector_type_of_t<T>::nlanes; \ const int nlanes = VTraits<vector_type_of_t<T>>::vlanes(); \
if (width < nlanes) \ if (width < nlanes) \
return 0; \ return 0; \
\ \
@ -2584,7 +2584,7 @@ MERGE3_SIMD(float)
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width) const uchar in4[], uchar out[], const int width)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
if (width < nlanes) if (width < nlanes)
return 0; return 0;
@ -2618,13 +2618,13 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
template<typename VT> template<typename VT>
CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b) CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b)
{ {
return a + b; return v_add(a, b);
} }
template<typename VT> template<typename VT>
CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b) CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b)
{ {
return a - b; return v_sub(a, b);
} }
CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2) CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2)
@ -2653,7 +2653,7 @@ typename std::enable_if<std::is_same<SRC, short>::value ||
std::is_same<SRC, ushort>::value, void>::type std::is_same<SRC, ushort>::value, void>::type
arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx) arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
vector_type_of_t<SRC> a1 = vx_load(in1x); vector_type_of_t<SRC> a1 = vx_load(in1x);
vector_type_of_t<SRC> a2 = vx_load(&in1x[nlanes / 2]); vector_type_of_t<SRC> a2 = vx_load(&in1x[nlanes / 2]);
@ -2667,7 +2667,7 @@ template<typename oper_tag>
CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x, CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x,
const float* in2x, uchar* outx) const float* in2x, uchar* outx)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 a1 = vx_load(in1x); v_float32 a1 = vx_load(in1x);
v_float32 a2 = vx_load(&in1x[nlanes / 4]); v_float32 a2 = vx_load(&in1x[nlanes / 4]);
@ -2709,7 +2709,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
std::is_same<DST, ushort>::value, void>::type std::is_same<DST, ushort>::value, void>::type
arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx) arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 a1 = vx_load(in1x); v_float32 a1 = vx_load(in1x);
v_float32 a2 = vx_load(&in1x[nlanes/2]); v_float32 a2 = vx_load(&in1x[nlanes/2]);
v_float32 b1 = vx_load(in2x); v_float32 b1 = vx_load(in2x);
@ -2761,7 +2761,7 @@ template<typename oper_tag, typename SRC, typename DST>
CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[], CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[],
DST out[], const int length) DST out[], const int length)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
if (length < nlanes) if (length < nlanes)
return 0; return 0;
@ -2869,7 +2869,7 @@ CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res)
CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx) CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_int32 a1 = v_round(vx_load(inx)); v_int32 a1 = v_round(vx_load(inx));
v_int32 a2 = v_round(vx_load(&inx[nlanes/4])); v_int32 a2 = v_round(vx_load(&inx[nlanes/4]));
@ -2887,7 +2887,7 @@ CV_ALWAYS_INLINE
typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx) convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
vector_type_of_t<SRC> a1 = vx_load(inx); vector_type_of_t<SRC> a1 = vx_load(inx);
vector_type_of_t<SRC> a2 = vx_load(&inx[nlanes/2]); vector_type_of_t<SRC> a2 = vx_load(&inx[nlanes/2]);
@ -2902,7 +2902,7 @@ CV_ALWAYS_INLINE
typename std::enable_if<DST_SHORT_OR_USHORT, void>::type typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
convertto_simd_nocoeff_impl(const float* inx, DST* outx) convertto_simd_nocoeff_impl(const float* inx, DST* outx)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_int32 a1 = v_round(vx_load(inx)); v_int32 a1 = v_round(vx_load(inx));
v_int32 a2 = v_round(vx_load(&inx[nlanes/2])); v_int32 a2 = v_round(vx_load(&inx[nlanes/2]));
@ -2942,7 +2942,7 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
#define CONVERTTO_NOCOEF_SIMD(SRC, DST) \ #define CONVERTTO_NOCOEF_SIMD(SRC, DST) \
int convertto_simd(const SRC in[], DST out[], const int length) \ int convertto_simd(const SRC in[], DST out[], const int length) \
{ \ { \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \ const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes(); \
if (length < nlanes) \ if (length < nlanes) \
return 0; \ return 0; \
\ \
@ -2982,7 +2982,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx,
const v_float32& v_alpha, const v_float32& v_alpha,
const v_float32& v_beta) const v_float32& v_beta)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_float32 a1 = vx_load(inx); v_float32 a1 = vx_load(inx);
v_float32 a2 = vx_load(&inx[nlanes / 4]); v_float32 a2 = vx_load(&inx[nlanes / 4]);
@ -3003,7 +3003,7 @@ typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha, convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha,
const v_float32& v_beta) const v_float32& v_beta)
{ {
constexpr int nlanes = v_uint8::nlanes; const int nlanes = VTraits<v_uint8>::vlanes();
v_int16 a = v_reinterpret_as_s16(vx_load(inx)); v_int16 a = v_reinterpret_as_s16(vx_load(inx));
v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
@ -3050,7 +3050,7 @@ convertto_scaled_simd_impl(const float* inx, DST* outx,
const v_float32& v_alpha, const v_float32& v_alpha,
const v_float32& v_beta) const v_float32& v_beta)
{ {
constexpr int nlanes = vector_type_of_t<DST>::nlanes; const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
v_float32 a1 = vx_load(inx); v_float32 a1 = vx_load(inx);
v_float32 a2 = vx_load(&inx[nlanes / 2]); v_float32 a2 = vx_load(&inx[nlanes / 2]);
@ -3111,7 +3111,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx,
int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \ int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length) \ const float beta, const int length) \
{ \ { \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \ const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes(); \
if (length < nlanes) \ if (length < nlanes) \
return 0; \ return 0; \
\ \

File diff suppressed because it is too large (modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp).
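
Most of the gfluidcore_func.simd.hpp changes shown above swap the overloaded vector operators for their named equivalents: arithmetic becomes v_add/v_sub/v_mul/v_div, and the comparison masks fed into v_select become v_eq/v_ne/v_ge/v_le (likewise, ::lane_type moves behind VTraits<>). The divide-by-zero guard used throughout the div kernels keeps its structure. A hedged sketch of that select pattern, with an illustrative helper name rather than code from the patch:

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Per-lane division that yields 0 where the divisor is 0, as the fluid div kernels do.
static cv::v_float32 div_or_zero(const cv::v_float32& num, const cv::v_float32& den)
{
    cv::v_float32 v_zero = cv::vx_setzero_f32();
    cv::v_float32 mask   = cv::v_eq(den, v_zero);            // old API: den == v_zero
    return cv::v_select(mask, v_zero, cv::v_div(num, den));  // old API: num / den
}
#endif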