Merge pull request #24325 from hanliutong:rewrite

Rewrite Universal Intrinsic code: float related part #24325

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro: rewrite them using the new Universal Intrinsic API.

The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module

This patch (hopefully) is the last one in the series. 

This patch mainly involves 3 parts (a minimal sketch of the typical rewrite pattern follows this list):
1. Add some modifications related to floating-point code (`CV_SIMD_64F`).
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
    so the `CV_SIMD` blocks that are not yet enabled for `CV_SIMD_SCALABLE` can be found by searching for `if CV_SIMD`.
3. Summary of the `CV_SIMD` blocks that remain unmodified (comments updated):
    - Some blocks cause test failures when enabled for RVV; they are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
    - Some blocks cannot be rewritten directly (not commented in the source code, just listed here):
      - ./modules/core/src/mathfuncs_core.simd.hpp (vector type wrapped in a class/struct)
      - ./modules/imgproc/src/color_lab.cpp (array of vector type)
      - ./modules/imgproc/src/color_rgb.simd.hpp (array of vector type)
      - ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
      These algorithms will need to be redesigned to accommodate scalable backends.
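To make parts 1 and 2 concrete, here is a minimal sketch of the rewrite pattern applied throughout this series (the function and buffer names are made up for illustration and are not taken from the patch): compile-time lane counts such as `v_float32::nlanes` become run-time `VTraits<...>::vlanes()` calls, and overloaded operators become named wrappers such as `v_add`/`v_mul`.

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical helper, for illustration only: dst[i] += src[i] * alpha.
static void scaleAdd32f(const float* src, float* dst, int n, float alpha)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Run-time lane count instead of the compile-time constant v_float32::nlanes.
    const int vlanes = cv::VTraits<cv::v_float32>::vlanes();
    const cv::v_float32 v_alpha = cv::vx_setall_f32(alpha);
    for (; i <= n - vlanes; i += vlanes)
    {
        cv::v_float32 v_src = cv::vx_load(src + i);
        cv::v_float32 v_dst = cv::vx_load(dst + i);
        // Named wrappers (v_add, v_mul) instead of overloaded operators.
        cv::v_store(dst + i, cv::v_add(v_dst, cv::v_mul(v_src, v_alpha)));
    }
    cv::vx_cleanup();
#endif
    for (; i < n; i++)   // scalar tail
        dst[i] += src[i] * alpha;
}
```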

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Author: HAN Liutong (committed via GitHub)
Parent: 3dcaf1f287
Commit: 07bf9cb013
Changed files (lines changed per file):
1. modules/calib3d/src/undistort.simd.hpp (74)
2. modules/core/include/opencv2/core/hal/intrin.hpp (17)
3. modules/core/src/arithm.simd.hpp (81)
4. modules/core/src/has_non_zero.simd.hpp (170)
5. modules/core/src/lapack.cpp (2)
6. modules/core/src/matmul.simd.hpp (1)
7. modules/core/src/mean.simd.hpp (2)
8. modules/dnn/src/int8layers/convolution_layer.cpp (4)
9. modules/dnn/src/int8layers/fully_connected_layer.cpp (2)
10. modules/imgproc/src/accum.simd.hpp (278)
11. modules/imgproc/src/color_hsv.simd.hpp (36)
12. modules/imgproc/src/median_blur.simd.hpp (6)

@ -89,8 +89,8 @@ public:
s2(_s2),
s3(_s3),
s4(_s4) {
#if CV_SIMD_64F
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
{
s_x[i] = ir[0] * i;
s_y[i] = ir[3] * i;
@ -123,26 +123,26 @@ public:
else
CV_Assert(m1 != NULL);
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const v_float64 v_one = vx_setall_f64(1.0);
for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
{
v_float64 m_0, m_1, m_2, m_3;
m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);
v_float64 xd_0 = x_0 * x_0;
v_float64 yd_0 = y_0 * y_0;
v_float64 xd_1 = x_1 * x_1;
v_float64 yd_1 = y_1 * y_1;
v_float64 xd_0 = v_mul(x_0, x_0);
v_float64 yd_0 = v_mul(y_0, y_0);
v_float64 xd_1 = v_mul(x_1, x_1);
v_float64 yd_1 = v_mul(y_1, y_1);
v_float64 r2_0 = xd_0 + yd_0;
v_float64 r2_1 = xd_1 + yd_1;
v_float64 r2_0 = v_add(xd_0, yd_0);
v_float64 r2_1 = v_add(xd_1, yd_1);
m_1 = vx_setall_f64(k3);
m_2 = vx_setall_f64(k2);
@ -151,18 +151,18 @@ public:
m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
m_3 = vx_setall_f64(k6);
m_2 = vx_setall_f64(k5);
m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));
m_3 = vx_setall_f64(2.0);
xd_0 = v_muladd(m_3, xd_0, r2_0);
yd_0 = v_muladd(m_3, yd_0, r2_0);
xd_1 = v_muladd(m_3, xd_1, r2_1);
yd_1 = v_muladd(m_3, yd_1, r2_1);
m_2 = x_0 * y_0 * m_3;
m_3 = x_1 * y_1 * m_3;
m_2 = v_mul(v_mul(x_0, y_0), m_3);
m_3 = v_mul(v_mul(x_1, y_1), m_3);
x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);
m_0 = vx_setall_f64(p1);
m_1 = vx_setall_f64(p2);
@ -176,8 +176,8 @@ public:
xd_1 = v_muladd(m_0, m_3, xd_1);
yd_1 = v_muladd(m_1, m_3, yd_1);
m_0 = r2_0 * r2_0;
m_1 = r2_1 * r2_1;
m_0 = v_mul(r2_0, r2_0);
m_1 = v_mul(r2_1, r2_1);
m_2 = vx_setall_f64(s2);
m_3 = vx_setall_f64(s1);
xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@ -203,17 +203,17 @@ public:
r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
m_0 = vx_setzero_f64();
r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));
m_0 = vx_setall_f64(fx);
m_1 = vx_setall_f64(u0);
m_2 = vx_setall_f64(fy);
m_3 = vx_setall_f64(v0);
x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);
if (m1type == CV_32FC1)
{
@ -225,20 +225,20 @@ public:
v_float32 mf0, mf1;
v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
v_store(&m1f[j * 2], mf0);
v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
}
else // m1type == CV_16SC2
{
m_0 = vx_setall_f64(INTER_TAB_SIZE);
x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);
v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
v_int32 iu = v_round(x_0, x_1);
v_int32 iv = v_round(y_0, y_1);
v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
v_int32 out0, out1;
v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
v_store(&m1[j * 2], v_pack(out0, out1));
}
}
@ -302,10 +302,10 @@ private:
double s2;
double s3;
double s4;
#if CV_SIMD_64F
double s_x[2*v_float64::nlanes];
double s_y[2*v_float64::nlanes];
double s_w[2*v_float64::nlanes];
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
double s_x[2*VTraits<v_float64>::max_nlanes];
double s_y[2*VTraits<v_float64>::max_nlanes];
double s_w[2*VTraits<v_float64>::max_nlanes];
#endif
};
}
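Note on the member arrays in the hunk above: with a scalable backend the vector length is only known at run time, so `v_float64::nlanes` can no longer be used as an array dimension; the patch sizes such buffers with the compile-time upper bound `VTraits<v_float64>::max_nlanes` instead. A minimal sketch of the same idea follows (the struct and field names are hypothetical, not from OpenCV):

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
struct RowCache   // hypothetical struct, for illustration only
{
    // max_nlanes is a compile-time upper bound, so the buffer is large enough
    // for whatever VTraits<v_float64>::vlanes() reports at run time.
    double s_x[2 * cv::VTraits<cv::v_float64>::max_nlanes];

    void fill(const double* ir)
    {
        // Only the first 2 * vlanes() entries are actually used per row.
        for (int i = 0; i < 2 * cv::VTraits<cv::v_float64>::vlanes(); ++i)
            s_x[i] = ir[0] * i;
    }
};
#endif
```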

@ -972,6 +972,15 @@ namespace CV__SIMD_NAMESPACE {
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@ -984,11 +993,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
@ -997,9 +1006,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
@ -1009,9 +1020,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
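The hunk above only adds equality/inequality wrappers (`v_eq`/`v_ne`) for the 64-bit integer types; the full comparison set stays commented out for them. A hedged usage sketch, assuming the new `v_eq` wrapper from this hunk (the helper function itself is hypothetical):

```cpp
#include <cstdint>
#include <vector>
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Hypothetical helper: count positions where two uint64 buffers match.
static int countEqual64(const uint64_t* a, const uint64_t* b, int n)
{
    int i = 0, cnt = 0;
    const int vlanes = cv::VTraits<cv::v_uint64>::vlanes();
    std::vector<uint64_t> maskbuf(vlanes);
    for (; i <= n - vlanes; i += vlanes)
    {
        // v_eq yields an all-ones lane where a == b and an all-zeros lane otherwise.
        cv::v_uint64 m = cv::v_eq(cv::vx_load(a + i), cv::vx_load(b + i));
        cv::v_store(maskbuf.data(), m);
        for (int k = 0; k < vlanes; k++)
            cnt += maskbuf[k] ? 1 : 0;
    }
    for (; i < n; i++)        // scalar tail
        cnt += (a[i] == b[i]) ? 1 : 0;
    return cnt;
}
#endif
```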

@ -69,7 +69,7 @@
#define DEFINE_SIMD_F32(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SIMD_F64(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
#else
@ -104,7 +104,7 @@ namespace cv { namespace hal {
#ifdef ARITHM_DEFINITIONS_ONLY
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
typedef int v_float64; // dummy
#endif
@ -266,7 +266,7 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
@ -276,7 +276,7 @@ struct op_absdiff<schar, v_int8>
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
@ -286,7 +286,7 @@ struct op_absdiff<short, v_int16>
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
@ -331,7 +331,7 @@ struct op_not
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
@ -396,7 +396,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef bin_loader<OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes();
#if !CV_NEON && CV_SIMD_WIDTH == 16
@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
@ -464,7 +464,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
@ -496,7 +496,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t
#define BIN_LOOP64F bin_loop_nosimd
#else
#define BIN_LOOP64F bin_loop
#endif //!CV_SIMD_64F
#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -621,7 +621,7 @@ struct op_cmpne
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct cmp_loader_n
@ -701,7 +701,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
#endif // CV_SIMD
@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, dst + x);
@ -768,7 +768,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
}
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template< template<typename T1, typename Tvec> class OP, typename T1>
static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
@ -822,7 +822,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
break;
}
}
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp)
//////////////////////////// Loaders ///////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
@ -1099,16 +1099,16 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
};
#endif // CV_SIMD
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
{
typedef OP<int, float, v_int32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_int32::nlanes};
static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);
v_int32 v_src1s = vx_load(src1 + step);
@ -1125,6 +1125,7 @@ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
}
static inline void l(const int* src1, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src1s = vx_load(src1 + step);
@ -1169,10 +1170,10 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
{
typedef OP<float, float, v_float32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_float32::nlanes};
static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src2 = vx_load(src2);
v_float32 v_src1s = vx_load(src1 + step);
@ -1186,6 +1187,7 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
}
static inline void l(const float* src1, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src1s = vx_load(src1 + step);
@ -1226,10 +1228,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
{
typedef OP<double, double, v_float64> op;
enum {step = v_float64::nlanes};
static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src2 = vx_load(src2);
v_float64 v_src1s = vx_load(src1 + step);
@ -1243,6 +1245,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
}
static inline void l(const double* src1, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src1s = vx_load(src1 + step);
@ -1253,7 +1256,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
v_store(dst + step, r1);
}
};
#endif // CV_SIMD_64F
#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
//////////////////////////// Loops /////////////////////////////////
@ -1263,7 +1266,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1277,7 +1280,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1309,7 +1312,7 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1322,7 +1325,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, scalar, dst + x);
@ -1349,7 +1352,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
// dual source
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
@ -1413,7 +1416,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste
#define SCALAR_LOOP64F scalar_loop_nosimd
#else
#define SCALAR_LOOP64F scalar_loop
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -1437,7 +1440,7 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1453,7 +1456,7 @@ struct op_mul_scale
template<>
struct op_mul_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1578,7 +1581,7 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1600,7 +1603,7 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1614,7 +1617,7 @@ struct op_div_scale<float, float, v_float32>
template<>
struct op_div_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1686,7 +1689,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1702,7 +1705,7 @@ struct op_add_scale
template<>
struct op_add_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_alpha = vx_setall_f64(*scalar);
@ -1719,7 +1722,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1737,7 +1740,7 @@ struct op_add_weighted
template<>
struct op_add_weighted<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
{
const v_float64 v_alpha = vx_setall_f64(scalars[0]);
@ -1836,7 +1839,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1858,7 +1861,7 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1872,7 +1875,7 @@ struct op_recip<float, float, v_float32>
template<>
struct op_recip<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
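A recurring change in this file: the loader structs previously exposed the lane count as `enum {step = v_int32::nlanes};`, but an enumerator needs a compile-time constant, which a scalable backend cannot provide. The patch therefore queries the count as a local `const int step = VTraits<...>::vlanes();` inside each method. A minimal standalone sketch of that pattern (the struct is hypothetical, not OpenCV's):

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
struct PairLoader   // hypothetical loader, for illustration only
{
    // Previously: enum { step = v_int32::nlanes };
    // With a scalable backend the lane count is a run-time value, so it is
    // queried locally where it is needed.
    static void load2(const int* src, cv::v_int32& a, cv::v_int32& b)
    {
        const int step = cv::VTraits<cv::v_int32>::vlanes();
        a = cv::vx_load(src);
        b = cv::vx_load(src + step);
    }
};
#endif
```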

@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len )
{
bool res = false;
const uchar* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint8 v_type;
const v_type v_zero = vx_setzero_u8();
constexpr const int unrollCount = 2;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const uchar* srcSimdEnd = src+len0;
@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
res = v_check_any(((v0 | v1) != v_zero));
src += VTraits<v_type>::vlanes();
res = v_check_any((v_ne(v_or(v0, v1), v_zero)));
}
v_cleanup();
@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len )
{
bool res = false;
const ushort* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint16 v_type;
const v_type v_zero = vx_setzero_u16();
constexpr const int unrollCount = 4;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const ushort* srcSimdEnd = src+len0;
@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
res = v_check_any(((v0 | v2) != v_zero));
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
res = v_check_any((v_ne(v_or(v0, v2), v_zero)));
}
v_cleanup();
@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len )
{
bool res = false;
const int* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_int32 v_type;
const v_type v_zero = vx_setzero_s32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const int* srcSimdEnd = src+len0;
@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
v0 |= v2;
v4 |= v6;
res = v_check_any(((v0 | v4) != v_zero));
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
res = v_check_any((v_ne(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len )
{
bool res = false;
const float* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
v0 |= v2;
v4 |= v6;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len )
{
bool res = false;
const double* srcEnd = src+len;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
typedef v_float64 v_type;
const v_type v_zero = vx_setzero_f64();
constexpr const int unrollCount = 16;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const double* srcSimdEnd = src+len0;
@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v8 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v9 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v10 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v11 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v12 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v13 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v14 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v15 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
v8 |= v9;
v10 |= v11;
v12 |= v13;
v14 |= v15;
v0 |= v2;
v4 |= v6;
v8 |= v10;
v12 |= v14;
v0 |= v4;
v8 |= v12;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v8 = v_or(v8, v9);
v10 = v_or(v10, v11);
v12 = v_or(v12, v13);
v14 = v_or(v14, v15);
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
v8 = v_or(v8, v10);
v12 = v_or(v12, v14);
v0 = v_or(v0, v4);
v8 = v_or(v8, v12);
//res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v8) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v8), v_zero)));
}
v_cleanup();
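The 'beware' comments above are worth spelling out: for floating-point lanes, `!=` (and its wrapper `v_ne`) may map to an ordered compare on some backends, so a NaN lane compared against zero can report 'false' for both the `==` and `!=` tests. The code therefore checks 'not all lanes equal zero' rather than 'any lane not equal to zero'. A minimal sketch of the same idea, assuming `src` holds at least `VTraits<v_float32>::vlanes()` elements (the helper is hypothetical):

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Hypothetical helper: true if any loaded lane is non-zero, treating NaN as non-zero.
static bool hasNonZeroChunk32f(const float* src)
{
    const cv::v_float32 v_zero = cv::vx_setzero_f32();
    cv::v_float32 v = cv::vx_load(src);
    // NaN-unsafe variant: if v_ne is an ordered compare, a NaN lane is "not
    // unequal" to zero and would be missed:
    //   return cv::v_check_any(cv::v_ne(v, v_zero));
    // NaN-safe variant: a NaN lane also fails the equality test, so
    // v_check_all() is false and the negation reports it as non-zero.
    return !cv::v_check_all(cv::v_eq(v, v_zero));
}
#endif
```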

@ -276,7 +276,7 @@ template<typename T> struct VBLAS
int givens(T*, T*, int, T, T) const { return 0; }
};
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 2*VTraits<v_float32>::vlanes() )

@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12)
double r = .0;
int i = 0;
const int step = VTraits<v_int32>::vlanes();

@ -24,7 +24,7 @@ struct SumSqr_SIMD
}
};
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct SumSqr_SIMD<uchar, int, int>

@ -19,7 +19,7 @@ namespace cv
namespace dnn
{
#if CV_SIMD
#if CV_SIMD128
static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
{
@ -1015,7 +1015,7 @@ public:
outptr[0] = std::min(std::max(out1, -128), 127);
out_j = 1;
}
#if CV_SIMD
#if CV_SIMD128
if( stride_w == 1 )
{
const int out_delta = 16;

@ -305,7 +305,7 @@ public:
#endif
{
int i = 0;
#if CV_SIMD
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),

@ -475,9 +475,9 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD
const int cVectorWidth = v_uint16::nlanes;
const int step = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int cVectorWidth = VTraits<v_uint16>::vlanes();
const int step = VTraits<v_float32>::vlanes();
if (!mask)
{
@ -493,8 +493,8 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
#else
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
v_store(dst + x, v_add(vx_load(dst + x), vx_load(src + x)));
v_store(dst + x + step, v_add(vx_load(dst + x + step), vx_load(src + x + step)));
}
#endif // CV_AVX && !CV_AVX2
}
@ -508,11 +508,11 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
v_uint16 v_masku16 = vx_load_expand(mask + x);
v_uint32 v_masku320, v_masku321;
v_expand(v_masku16, v_masku320, v_masku321);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));
v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
v_store(dst + x, v_add(vx_load(dst + x), v_and(vx_load(src + x), v_mask0)));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(vx_load(src + x + step), v_mask1)));
}
}
else if (cn == 3)
@ -522,25 +522,25 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
v_uint16 v_masku16 = vx_load_expand(mask + x);
v_uint32 v_masku320, v_masku321;
v_expand(v_masku16, v_masku320, v_masku321);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));
v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
v_src00 = v_src00 & v_mask0;
v_src01 = v_src01 & v_mask1;
v_src10 = v_src10 & v_mask0;
v_src11 = v_src11 & v_mask1;
v_src20 = v_src20 & v_mask0;
v_src21 = v_src21 & v_mask1;
v_src00 = v_and(v_src00, v_mask0);
v_src01 = v_and(v_src01, v_mask1);
v_src10 = v_and(v_src10, v_mask0);
v_src11 = v_and(v_src11, v_mask1);
v_src20 = v_and(v_src20, v_mask0);
v_src21 = v_and(v_src21, v_mask1);
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
}
}
}
@ -862,9 +862,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float32::nlanes;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float32>::vlanes();
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -889,8 +889,8 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
v_float64 v_src0 = v_cvt_f64(v_src);
v_float64 v_src1 = v_cvt_f64_high(v_src);
v_store(dst + x, vx_load(dst + x) + v_src0);
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
v_store(dst + x, v_add(vx_load(dst + x), v_src0));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
}
#endif // CV_AVX && !CV_AVX2
}
@ -904,15 +904,15 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float32 v_src = vx_load(src + x);
v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
v_float64 v_src0 = v_and(v_cvt_f64(v_src), v_mask0);
v_float64 v_src1 = v_and(v_cvt_f64_high(v_src), v_mask1);
v_store(dst + x, vx_load(dst + x) + v_src0);
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
v_store(dst + x, v_add(vx_load(dst + x), v_src0));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
}
}
else if (cn == 3)
@ -922,24 +922,24 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float32 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
v_float64 v_src00 = v_and(v_cvt_f64(v_src0), v_mask0);
v_float64 v_src01 = v_and(v_cvt_f64_high(v_src0), v_mask1);
v_float64 v_src10 = v_and(v_cvt_f64(v_src1), v_mask0);
v_float64 v_src11 = v_and(v_cvt_f64_high(v_src1), v_mask1);
v_float64 v_src20 = v_and(v_cvt_f64(v_src2), v_mask0);
v_float64 v_src21 = v_and(v_cvt_f64_high(v_src2), v_mask1);
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
}
}
}
@ -950,9 +950,9 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float64::nlanes * 2;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -971,8 +971,8 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
v_float64 v_src0 = vx_load(src + x);
v_float64 v_src1 = vx_load(src + x + step);
v_store(dst + x, vx_load(dst + x) + v_src0);
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
v_store(dst + x, v_add(vx_load(dst + x), v_src0));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
}
#endif // CV_AVX && !CV_AVX2
}
@ -986,14 +986,14 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_src0 = vx_load(src + x);
v_float64 v_src1 = vx_load(src + x + step);
v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
v_store(dst + x, v_add(vx_load(dst + x), v_and(v_src0, v_mask0)));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_src1, v_mask1)));
}
}
else if (cn == 3)
@ -1003,25 +1003,25 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
v_src00 = v_src00 & v_mask0;
v_src01 = v_src01 & v_mask1;
v_src10 = v_src10 & v_mask0;
v_src11 = v_src11 & v_mask1;
v_src20 = v_src20 & v_mask0;
v_src21 = v_src21 & v_mask1;
v_src00 = v_and(v_src00, v_mask0);
v_src01 = v_and(v_src01, v_mask1);
v_src10 = v_and(v_src10, v_mask0);
v_src11 = v_and(v_src11, v_mask1);
v_src20 = v_and(v_src20, v_mask0);
v_src21 = v_and(v_src21, v_mask1);
v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
}
}
}
@ -1256,9 +1256,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD
const int cVectorWidth = v_uint16::nlanes;
const int step = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int cVectorWidth = VTraits<v_uint16>::vlanes();
const int step = VTraits<v_float32>::vlanes();
if (!mask)
{
@ -1293,12 +1293,12 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
v_uint16 v_mask16 = vx_load_expand(mask + x);
v_uint32 v_mask_0, v_mask_1;
v_expand(v_mask16, v_mask_0, v_mask_1);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));
v_float32 v_src0 = vx_load(src + x);
v_float32 v_src1 = vx_load(src + x + step);
v_src0 = v_src0 & v_mask0;
v_src1 = v_src1 & v_mask1;
v_src0 = v_and(v_src0, v_mask0);
v_src1 = v_and(v_src1, v_mask1);
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
@ -1311,18 +1311,18 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
v_uint16 v_mask16 = vx_load_expand(mask + x);
v_uint32 v_mask_0, v_mask_1;
v_expand(v_mask16, v_mask_0, v_mask_1);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));
v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
v_src00 = v_src00 & v_mask0;
v_src01 = v_src01 & v_mask1;
v_src10 = v_src10 & v_mask0;
v_src11 = v_src11 & v_mask1;
v_src20 = v_src20 & v_mask0;
v_src21 = v_src21 & v_mask1;
v_src00 = v_and(v_src00, v_mask0);
v_src01 = v_and(v_src01, v_mask1);
v_src10 = v_and(v_src10, v_mask0);
v_src11 = v_and(v_src11, v_mask1);
v_src20 = v_and(v_src20, v_mask0);
v_src21 = v_and(v_src21, v_mask1);
v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -1625,9 +1625,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in
void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float32::nlanes;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float32>::vlanes();
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -1667,9 +1667,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
v_uint32 v_mask = vx_load_expand_q(mask + x);;
v_mask = ~(v_mask == v_0);
v_mask = v_not(v_eq(v_mask, v_0));
v_float32 v_src = vx_load(src + x);
v_src = v_src & v_reinterpret_as_f32(v_mask);
v_src = v_and(v_src, v_reinterpret_as_f32(v_mask));
v_float64 v_src0 = v_cvt_f64(v_src);
v_float64 v_src1 = v_cvt_f64_high(v_src);
@ -1682,13 +1682,13 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
v_uint32 v_mask = vx_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_mask = v_not(v_eq(v_mask, v_0));
v_float32 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
v_src0 = v_and(v_src0, v_reinterpret_as_f32(v_mask));
v_src1 = v_and(v_src1, v_reinterpret_as_f32(v_mask));
v_src2 = v_and(v_src2, v_reinterpret_as_f32(v_mask));
v_float64 v_src00 = v_cvt_f64(v_src0);
v_float64 v_src01 = v_cvt_f64_high(v_src0);
@ -1720,9 +1720,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float64::nlanes * 2;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -1756,12 +1756,12 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_mask32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_src0 = vx_load(src + x);
v_float64 v_src1 = vx_load(src + x + step);
v_src0 = v_src0 & v_mask0;
v_src1 = v_src1 & v_mask1;
v_src0 = v_and(v_src0, v_mask0);
v_src1 = v_and(v_src1, v_mask1);
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
}
@ -1773,18 +1773,18 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_mask32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
v_src00 = v_src00 & v_mask0;
v_src01 = v_src01 & v_mask1;
v_src10 = v_src10 & v_mask0;
v_src11 = v_src11 & v_mask1;
v_src20 = v_src20 & v_mask0;
v_src21 = v_src21 & v_mask1;
v_src00 = v_and(v_src00, v_mask0);
v_src01 = v_and(v_src01, v_mask1);
v_src10 = v_and(v_src10, v_mask0);
v_src11 = v_and(v_src11, v_mask1);
v_src20 = v_and(v_src20, v_mask0);
v_src21 = v_and(v_src21, v_mask1);
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -2035,9 +2035,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD
const int cVectorWidth = v_uint16::nlanes;
const int step = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int cVectorWidth = VTraits<v_uint16>::vlanes();
const int step = VTraits<v_float32>::vlanes();
if (!mask)
{
@ -2069,11 +2069,11 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
{
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));
v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(vx_load(src1 + x), vx_load(src2 + x)), v_mask0)));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), v_mask1)));
}
}
else if (cn == 3)
@ -2082,8 +2082,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
{
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));
v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@ -2096,8 +2096,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
v_store_interleave(dst + x * cn, v_add(v_dst00, v_and(v_mul(v_1src00, v_2src00), v_mask0)), v_add(v_dst10, v_and(v_mul(v_1src10, v_2src10), v_mask0)), v_add(v_dst20, v_and(v_mul(v_1src20, v_2src20), v_mask0)));
v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_and(v_mul(v_1src01, v_2src01), v_mask1)), v_add(v_dst11, v_and(v_mul(v_1src11, v_2src11), v_mask1)), v_add(v_dst21, v_and(v_mul(v_1src21, v_2src21), v_mask1)));
}
}
}
@ -2398,9 +2398,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float32::nlanes;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float32>::vlanes();
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -2447,11 +2447,11 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
v_uint32 v_mask = vx_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_mask = v_not(v_eq(v_mask, v_0));
v_float32 v_1src = vx_load(src1 + x);
v_float32 v_2src = vx_load(src2 + x);
v_1src = v_1src & v_reinterpret_as_f32(v_mask);
v_2src = v_2src & v_reinterpret_as_f32(v_mask);
v_1src = v_and(v_1src, v_reinterpret_as_f32(v_mask));
v_2src = v_and(v_2src, v_reinterpret_as_f32(v_mask));
v_float64 v_1src0 = v_cvt_f64(v_1src);
v_float64 v_1src1 = v_cvt_f64_high(v_1src);
@ -2467,16 +2467,16 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
v_uint32 v_mask = vx_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_mask = v_not(v_eq(v_mask, v_0));
v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask);
v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask);
v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask);
v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
v_1src0 = v_and(v_1src0, v_reinterpret_as_f32(v_mask));
v_1src1 = v_and(v_1src1, v_reinterpret_as_f32(v_mask));
v_1src2 = v_and(v_1src2, v_reinterpret_as_f32(v_mask));
v_2src0 = v_and(v_2src0, v_reinterpret_as_f32(v_mask));
v_2src1 = v_and(v_2src1, v_reinterpret_as_f32(v_mask));
v_2src2 = v_and(v_2src2, v_reinterpret_as_f32(v_mask));
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -2501,9 +2501,9 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
{
int x = 0;
#if CV_SIMD_64F
const int cVectorWidth = v_float64::nlanes * 2;
const int step = v_float64::nlanes;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
const int step = VTraits<v_float64>::vlanes();
if (!mask)
{
@ -2542,16 +2542,16 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_mask32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_src00 = vx_load(src1 + x);
v_float64 v_src01 = vx_load(src1 + x + step);
v_float64 v_src10 = vx_load(src2 + x);
v_float64 v_src11 = vx_load(src2 + x + step);
v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(v_src00, v_src10), v_mask0)));
v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(v_src01, v_src11), v_mask1)));
}
}
else if (cn == 3)
@ -2561,8 +2561,8 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
v_uint64 v_masku640, v_masku641;
v_expand(v_mask32, v_masku640, v_masku641);
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@ -2570,19 +2570,19 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;
v_float64 v_src00 = v_mul(v_and(v_1src00, v_mask0), v_2src00);
v_float64 v_src01 = v_mul(v_and(v_1src01, v_mask1), v_2src01);
v_float64 v_src10 = v_mul(v_and(v_1src10, v_mask0), v_2src10);
v_float64 v_src11 = v_mul(v_and(v_1src11, v_mask1), v_2src11);
v_float64 v_src20 = v_mul(v_and(v_1src20, v_mask0), v_2src20);
v_float64 v_src21 = v_mul(v_and(v_1src21, v_mask1), v_2src21);
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
}
}
}

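The accum.simd.hpp hunks above all apply one mechanical mapping: the compile-time lane count `v_float64::nlanes` becomes `VTraits<v_float64>::vlanes()`, and the overloaded operators `~`, `==`, `&`, `*`, `+` become the function-style `v_not`, `v_eq`, `v_and`, `v_mul`, `v_add`. The sketch below is not part of the patch; it is a minimal, standalone rendering of the rewritten masked product-accumulation pattern for CV_64F data, assuming headers that provide the function-style universal intrinsics, with an invented function name.

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical standalone version of the masked accProd loop for doubles,
// written against the new function-style universal intrinsic API only.
static void accProdMaskedSketch(const double* src1, const double* src2,
                                double* dst, const unsigned char* mask, int len)
{
    using namespace cv;
    int x = 0;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    const int step = VTraits<v_float64>::vlanes();     // run-time lane count on scalable backends
    const v_uint64 v_0 = vx_setall_u64(0);
    for (; x <= len - 2 * step; x += 2 * step)
    {
        v_uint32 m32 = vx_load_expand_q(mask + x);      // 8-bit mask -> 32-bit lanes
        v_uint64 m0, m1;
        v_expand(m32, m0, m1);                          // 32-bit -> two 64-bit halves
        v_float64 vm0 = v_reinterpret_as_f64(v_not(v_eq(m0, v_0)));  // all-ones where mask != 0
        v_float64 vm1 = v_reinterpret_as_f64(v_not(v_eq(m1, v_0)));
        v_float64 p0 = v_and(v_mul(vx_load(src1 + x),        vx_load(src2 + x)),        vm0);
        v_float64 p1 = v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), vm1);
        v_store(dst + x,        v_add(vx_load(dst + x),        p0));
        v_store(dst + x + step, v_add(vx_load(dst + x + step), p1));
    }
#endif
    for (; x < len; x++)                                // scalar tail
        if (mask[x])
            dst[x] += src1[x] * src2[x];
}
```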
@ -98,7 +98,7 @@ struct RGB2HSV_b
int i = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
for ( ; i <= n - vsize;
i += vsize, src += scn*vsize, dst += 3*vsize)
@ -274,7 +274,7 @@ struct RGB2HSV_f
: srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
{ }
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
inline void process(const v_float32& v_r, const v_float32& v_g, const v_float32& v_b,
v_float32& v_h, v_float32& v_s, v_float32& v_v,
float hscale) const
@ -308,7 +308,7 @@ struct RGB2HSV_f
float hscale = hrange*(1.f/360.f);
n *= 3;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
for ( ; i <= n - 3*vsize; i += 3*vsize, src += scn * vsize)
{
@ -368,7 +368,7 @@ struct RGB2HSV_f
};
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
inline void HSV2RGB_simd(const v_float32& h, const v_float32& s, const v_float32& v,
v_float32& b, v_float32& g, v_float32& r, float hscale)
{
@ -473,7 +473,7 @@ struct HSV2RGB_f
float hs = hscale;
n *= 3;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 valpha = vx_setall_f32(alpha);
for (; i <= n - vsize*3; i += vsize*3, dst += dcn * vsize)
@ -530,7 +530,7 @@ struct HSV2RGB_b
int j = 0, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
for (j = 0; j <= (n - vsize*4) * 3; j += 3 * 4 * vsize, dst += dcn * 4 * vsize)
@ -679,7 +679,7 @@ struct RGB2HLS_f
{
}
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
inline void process(const v_float32& r, const v_float32& g, const v_float32& b,
const v_float32& vhscale,
v_float32& h, v_float32& l, v_float32& s) const
@ -718,7 +718,7 @@ struct RGB2HLS_f
int i = 0, bidx = blueIdx, scn = srccn;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 vhscale = vx_setall_f32(hscale);
@ -802,13 +802,13 @@ struct RGB2HLS_b
int scn = srccn;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
#else
float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
#endif
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static const int fsize = VTraits<v_float32>::vlanes();
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@ -823,7 +823,7 @@ struct RGB2HLS_b
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v255inv = vx_setall_f32(1.f/255.f);
if (scn == 3)
{
@ -902,7 +902,7 @@ struct RGB2HLS_b
cvt(buf, buf, dn);
int j = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4)
{
v_float32 f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11;
@ -973,7 +973,7 @@ struct HLS2RGB_f
: dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange)
{ }
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
inline void process(const v_float32& h, const v_float32& l, const v_float32& s,
v_float32& b, v_float32& g, v_float32& r) const
{
@ -1016,7 +1016,7 @@ struct HLS2RGB_f
int i = 0, bidx = blueIdx, dcn = dstcn;
float alpha = ColorChannel<float>::max();
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static const int vsize = VTraits<v_float32>::vlanes();
for (; i <= n - vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
{
@ -1099,13 +1099,13 @@ struct HLS2RGB_b
int i, j, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
#else
float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
#endif
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static const int fsize = VTraits<v_float32>::vlanes();
//TODO: fix that when v_interleave is available
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@ -1122,7 +1122,7 @@ struct HLS2RGB_b
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j <= dn*3 - 3*4*fsize; j += 3*4*fsize)
{
// 3x uchar -> 3*4 float
@ -1179,7 +1179,7 @@ struct HLS2RGB_b
}
cvt(buf, buf, dn);
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v255 = vx_setall_f32(255.f);
if(dcn == 3)
{

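The color_hsv.simd.hpp hunks change nothing but the guard spelling: `#if CV_SIMD || CV_SIMD_SCALABLE` becomes `#if (CV_SIMD || CV_SIMD_SCALABLE)`, so that searching for a bare `if CV_SIMD` now only hits blocks that are not yet enabled for the scalable backend (item 2 of the PR description). A minimal illustration of the convention, with an invented helper name:

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical helper: how a scalable-ready block reads after this patch.
// The parenthesized guard marks code already ported to the new API; a bare
// "#if CV_SIMD" is left on blocks still waiting to be enabled for
// CV_SIMD_SCALABLE.
int floatsPerVectorStep()
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    return cv::VTraits<cv::v_float32>::vlanes();   // may only be known at run time (e.g. RVV)
#else
    return 1;                                      // scalar fallback
#endif
}
```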
@ -548,7 +548,7 @@ struct MinMax32f
}
};
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
struct MinMaxVec8u
{
@ -688,7 +688,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width )
break;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
#else
int nlanes = 1;
@ -793,7 +793,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width )
break;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
#else
int nlanes = 1;

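The median_blur.simd.hpp hunks are the same guard-spelling change, here around the generic lane query used by `medianBlur_SortNet`. A short sketch of that query pattern follows, with a hypothetical operator class standing in for the real `MinMaxVec*` types:

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Hypothetical stand-in for the MinMaxVec* operator classes: the vector type
// is exposed as arg_type so callers can query its width generically.
struct MinMaxVec32fSketch
{
    typedef cv::v_float32 arg_type;
    static arg_type load(const float* p)           { return cv::vx_load(p); }
    static void store(float* p, const arg_type& v) { cv::v_store(p, v); }
};
#endif

// Lane count for whatever vector type the operator works on; returning 1
// keeps the surrounding loop valid as a plain scalar implementation when
// no SIMD backend is available.
template <class VecOp>
int lanesOf()
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    return cv::VTraits<typename VecOp::arg_type>::vlanes();
#else
    return 1;
#endif
}
```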