From a287605c3e690aebd92080576d52f875cdb01242 Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Fri, 13 Oct 2023 19:23:30 +0800 Subject: [PATCH] Clean up the Universal Intrinsic API. --- .../core/include/opencv2/core/hal/intrin.hpp | 70 ++- .../include/opencv2/core/hal/intrin_neon.hpp | 317 ++++++------ modules/core/src/convert.hpp | 18 +- modules/core/src/mathfuncs_core.simd.hpp | 122 ++--- modules/core/src/matmul.simd.hpp | 30 +- modules/core/src/matrix_transform.cpp | 36 +- modules/core/src/minmax.cpp | 270 +++++----- modules/core/test/test_intrin_utils.hpp | 5 - .../dnn/src/int8layers/convolution_layer.cpp | 20 +- .../src/int8layers/fully_connected_layer.cpp | 4 +- modules/dnn/src/int8layers/pooling_layer.cpp | 16 +- .../src/layers/cpu_kernels/conv_depthwise.cpp | 28 +- .../layers/cpu_kernels/conv_winograd_f63.cpp | 176 +++---- modules/dnn/src/layers/elementwise_layers.cpp | 24 +- .../dnn/src/layers/fully_connected_layer.cpp | 2 +- modules/dnn/src/layers/pooling_layer.cpp | 22 +- modules/features2d/src/fast.cpp | 36 +- modules/features2d/src/fast_score.cpp | 6 +- .../fluid/gfluidimgproc_func.simd.hpp | 26 +- .../fluid/gfluidimgproc_simd_avx2.hpp | 2 +- modules/imgproc/src/bilateral_filter.simd.hpp | 34 +- modules/imgproc/src/box_filter.simd.hpp | 64 +-- modules/imgproc/src/color_lab.cpp | 299 +++++------ modules/imgproc/src/color_rgb.simd.hpp | 12 +- modules/imgproc/src/demosaicing.cpp | 294 +++++------ modules/imgproc/src/filter.simd.hpp | 38 +- modules/imgproc/src/histogram.cpp | 8 +- modules/imgproc/src/imgwarp.cpp | 272 +++++----- modules/imgproc/src/median_blur.simd.hpp | 42 +- modules/imgproc/src/moments.cpp | 20 +- modules/imgproc/src/pyramids.cpp | 14 +- modules/imgproc/src/resize.cpp | 34 +- modules/imgproc/src/sumpixels.simd.hpp | 468 +++++++++--------- modules/objdetect/src/hog.cpp | 92 ++-- modules/video/src/dis_flow.cpp | 40 +- modules/video/src/lkpyramid.cpp | 48 +- modules/video/src/optflowgf.cpp | 28 +- modules/video/src/variational_refinement.cpp | 101 ++-- samples/cpp/simd_basic.cpp | 4 +- .../core/univ_intrin/univ_intrin.cpp | 8 +- 40 files changed, 1615 insertions(+), 1535 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index bf9a247054..904b05e405 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -723,7 +723,7 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } -#if !CV_SIMD_SCALABLE +#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) // Compatibility layer template struct VTraits { @@ -1148,6 +1148,74 @@ namespace CV__SIMD_NAMESPACE { #endif //!CV_SIMD_SCALABLE +#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP) +// Compatibility layer for the backend that cleaned up. + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ + return v_add(v_add(f1, f2), vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + template \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ + return v_mul(v_mul(f1, f2), vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + + #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ + { \ + return v_extract_n::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_EXTRACT(v_uint8) + OPENCV_HAL_WRAP_EXTRACT(v_int8) + OPENCV_HAL_WRAP_EXTRACT(v_uint16) + OPENCV_HAL_WRAP_EXTRACT(v_int16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32) + OPENCV_HAL_WRAP_EXTRACT(v_int32) + OPENCV_HAL_WRAP_EXTRACT(v_uint64) + OPENCV_HAL_WRAP_EXTRACT(v_int64) + OPENCV_HAL_WRAP_EXTRACT(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64) + #endif + + #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ + inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ + { \ + return v_broadcast_element::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_BROADCAST(v_uint32) + OPENCV_HAL_WRAP_BROADCAST(v_int32) + OPENCV_HAL_WRAP_BROADCAST(v_float32) + +#endif //CV_NEON + //! 
@cond IGNORED // backward compatibility diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 14eb180819..ee9934135a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -131,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64) #endif +//////////// Compatibility layer //////////// +template struct VTraits { + static inline int vlanes() { return T::nlanes; } + enum { max_nlanes = T::nlanes, nlanes = T::nlanes }; + using lane_type = typename T::lane_type; +}; + +template +inline typename VTraits::lane_type v_get0(const T& v) \ +{ \ + return v.get0(); \ +} //////////// Types //////////// struct v_uint8x16 { - typedef uchar lane_type; - enum { nlanes = 16 }; - v_uint8x16() {} explicit v_uint8x16(uint8x16_t v) : val(v) {} v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, @@ -146,19 +155,22 @@ struct v_uint8x16 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_u8(v); } + uint8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef uchar lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint8x16& v); uchar get0() const { return vgetq_lane_u8(val, 0); } - - uint8x16_t val; }; struct v_int8x16 { - typedef schar lane_type; - enum { nlanes = 16 }; - v_int8x16() {} explicit v_int8x16(int8x16_t v) : val(v) {} v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, @@ -167,19 +179,22 @@ struct v_int8x16 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_s8(v); } + int8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef schar lane_type; + + friend typename VTraits::lane_type v_get0(const v_int8x16& v); schar get0() const { return vgetq_lane_s8(val, 0); } - - int8x16_t val; }; struct v_uint16x8 { - typedef ushort lane_type; - enum { nlanes = 8 }; - v_uint16x8() {} explicit v_uint16x8(uint16x8_t v) : val(v) {} v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) @@ -187,19 +202,22 @@ struct v_uint16x8 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_u16(v); } + uint16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef ushort lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint16x8& v); ushort get0() const { return vgetq_lane_u16(val, 0); } - - uint16x8_t val; }; struct v_int16x8 { - typedef short lane_type; - enum { nlanes = 8 }; - v_int16x8() {} explicit v_int16x8(int16x8_t v) : val(v) {} v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) @@ -207,19 +225,22 @@ struct v_int16x8 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_s16(v); } + int16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef short lane_type; + + friend typename VTraits::lane_type v_get0(const v_int16x8& v); short get0() const { return vgetq_lane_s16(val, 0); } - - int16x8_t val; }; struct v_uint32x4 { - typedef unsigned lane_type; - enum { nlanes = 4 }; - v_uint32x4() {} explicit v_uint32x4(uint32x4_t v) : val(v) {} v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) @@ -227,19 +248,22 @@ struct v_uint32x4 unsigned v[] = {v0, v1, v2, v3}; val = vld1q_u32(v); } + uint32x4_t val; + 
+private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef unsigned lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint32x4& v); unsigned get0() const { return vgetq_lane_u32(val, 0); } - - uint32x4_t val; }; struct v_int32x4 { - typedef int lane_type; - enum { nlanes = 4 }; - v_int32x4() {} explicit v_int32x4(int32x4_t v) : val(v) {} v_int32x4(int v0, int v1, int v2, int v3) @@ -247,18 +271,22 @@ struct v_int32x4 int v[] = {v0, v1, v2, v3}; val = vld1q_s32(v); } + int32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef int lane_type; + + friend typename VTraits::lane_type v_get0(const v_int32x4& v); int get0() const { return vgetq_lane_s32(val, 0); } - int32x4_t val; }; struct v_float32x4 { - typedef float lane_type; - enum { nlanes = 4 }; - v_float32x4() {} explicit v_float32x4(float32x4_t v) : val(v) {} v_float32x4(float v0, float v1, float v2, float v3) @@ -266,18 +294,22 @@ struct v_float32x4 float v[] = {v0, v1, v2, v3}; val = vld1q_f32(v); } + float32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef float lane_type; + + friend typename VTraits::lane_type v_get0(const v_float32x4& v); float get0() const { return vgetq_lane_f32(val, 0); } - float32x4_t val; }; struct v_uint64x2 { - typedef uint64 lane_type; - enum { nlanes = 2 }; - v_uint64x2() {} explicit v_uint64x2(uint64x2_t v) : val(v) {} v_uint64x2(uint64 v0, uint64 v1) @@ -285,18 +317,21 @@ struct v_uint64x2 uint64 v[] = {v0, v1}; val = vld1q_u64(v); } + uint64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef uint64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint64x2& v); uint64 get0() const { return vgetq_lane_u64(val, 0); } - uint64x2_t val; }; struct v_int64x2 { - typedef int64 lane_type; - enum { nlanes = 2 }; - v_int64x2() {} explicit v_int64x2(int64x2_t v) : val(v) {} v_int64x2(int64 v0, int64 v1) @@ -304,19 +339,23 @@ struct v_int64x2 int64 v[] = {v0, v1}; val = vld1q_s64(v); } + int64x2_t val; + +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef int64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_int64x2& v); int64 get0() const { return vgetq_lane_s64(val, 0); } - int64x2_t val; }; #if CV_SIMD128_64F struct v_float64x2 { - typedef double lane_type; - enum { nlanes = 2 }; - v_float64x2() {} explicit v_float64x2(float64x2_t v) : val(v) {} v_float64x2(double v0, double v1) @@ -324,11 +363,18 @@ struct v_float64x2 double v[] = {v0, v1}; val = vld1q_f64(v); } + + float64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef double lane_type; + + friend typename VTraits::lane_type v_get0(const v_float64x2& v); double get0() const { return vgetq_lane_f64(val, 0); } - float64x2_t val; }; #endif @@ -460,71 +506,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) 
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64) +} + +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64) #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64) #else -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b) { float32x4_t reciprocal = vrecpeq_f32(b.val); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); return v_float32x4(vmulq_f32(a.val, reciprocal)); } -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) -{ - float32x4_t reciprocal = vrecpeq_f32(b.val); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - a.val = vmulq_f32(a.val, reciprocal); - return a; -} #endif // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * 
(const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8) @@ -698,7 +729,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { - return v_dotprod_expand(a, b) + c; + return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) @@ -715,7 +746,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { - return v_dotprod_expand(a, b) + c; + return v_add(v_dotprod_expand(a, b), c); } #endif // 16 >> 64 @@ -735,7 +766,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -752,7 +783,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f #if CV_SIMD128_64F @@ -760,7 +791,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #endif //////// Fast Dot Product //////// @@ -850,7 +881,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { - return v_dotprod_expand_fast(a, b) + c; + return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) @@ -861,7 +892,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { - return v_dotprod_expand_fast(a, b) + c; + return v_add(v_dotprod_expand_fast(a, b), c); } #endif @@ -875,7 +906,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { @@ -884,22 +915,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod))); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 
>> 64f #if CV_SIMD128_64F inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod_fast(a, b)); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } #endif #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \ + inline _Tpvec v_not (const _Tpvec& a) \ { \ return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \ } @@ -914,21 +945,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64) OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64) #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not (const v_float32x4& a) { return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); } @@ -942,7 +968,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x) inline v_float32x4 v_invsqrt(const v_float32x4& x) { v_float32x4 one = v_setall_f32(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } #else inline v_float32x4 v_sqrt(const v_float32x4& x) @@ -975,21 +1001,16 @@ inline v_float32x4 v_abs(v_float32x4 x) #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not (const v_float64x2& a) { return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val)))); } @@ -1002,7 +1023,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x) 
inline v_float64x2 v_invsqrt(const v_float64x2& x) { v_float64x2 one = v_setall_f64(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } inline v_float64x2 v_abs(v_float64x2 x) @@ -1037,17 +1058,17 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64) #endif #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) @@ -1065,22 +1086,22 @@ static inline uint64x2_t vmvnq_u64(uint64x2_t a) } //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64) //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64) -static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) { return v_uint64x2(vceqq_u64(a.val, b.val)); } -static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) { return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); } -static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) { return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); } -static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) { return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); } #else -static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) { uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); uint32x4_t swapped = vrev64q_u32(cmp); return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped))); } -static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) { uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); uint32x4_t swapped = vrev64q_u32(cmp); @@ -1088,13 +1109,13 @@ static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)); return v_uint64x2(veorq_u64(v_eq, vx)); } -static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b) +static inline 
v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) { - return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b)); + return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } -static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) { - return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b)); + return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } #endif #if CV_SIMD128_64F @@ -1207,9 +1228,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ // trade efficiency for convenience #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \ @@ -1231,13 +1252,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) template inline _Tpvec v_rotate_right(const _Tpvec& a) \ { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { CV_UNUSED(b); return a; } diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index c689276218..65a998bd8f 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -358,8 +358,8 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f static inline void vx_load_as(const double* ptr, v_float32& a) { - const int VECSZ = v_float32::nlanes; - float buf[VECSZ*2]; + const int VECSZ = VTraits::vlanes(); + float buf[VTraits::max_nlanes*2]; for( int i = 0; i < VECSZ; i++ ) buf[i] = saturate_cast(ptr[i]); @@ -369,19 +369,19 @@ static inline void vx_load_as(const double* ptr, v_float32& a) template static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b) { - const int VECSZ = _Tdvec::nlanes; - typename _Tdvec::lane_type buf[VECSZ*2]; + const int VECSZ = VTraits<_Tdvec>::vlanes(); + typename VTraits<_Tdvec>::lane_type buf[VTraits<_Tdvec>::max_nlanes*2]; for( int i = 0; i < VECSZ*2; i++ ) - buf[i] = saturate_cast(ptr[i]); + buf[i] = saturate_cast::lane_type>(ptr[i]); a = vx_load(buf); b = vx_load(buf + VECSZ); } static inline void v_store_as(double* ptr, const v_float32& a) { - const int VECSZ = v_float32::nlanes; - float buf[VECSZ]; + const int VECSZ = VTraits::vlanes(); + float buf[VTraits::max_nlanes]; v_store(buf, a); for( int i = 0; i < VECSZ; i++ ) @@ -391,8 
+391,8 @@ static inline void v_store_as(double* ptr, const v_float32& a) template static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b) { - const int VECSZ = _Tsvec::nlanes; - typename _Tsvec::lane_type buf[VECSZ*2]; + const int VECSZ = VTraits<_Tsvec>::vlanes(); + typename VTraits<_Tsvec>::lane_type buf[VTraits<_Tsvec>::max_nlanes*2]; v_store(buf, a); v_store(buf + VECSZ, b); for( int i = 0; i < VECSZ*2; i++ ) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index 1bf36bb174..2aa107b9be 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -93,13 +93,13 @@ struct v_atan_f32 { v_float32 ax = v_abs(x); v_float32 ay = v_abs(y); - v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps); - v_float32 cc = c * c; - v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c; - a = v_select(ax >= ay, a, val90 - a); - a = v_select(x < z, val180 - a, a); - a = v_select(y < z, val360 - a, a); - return a * s; + v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps)); + v_float32 cc = v_mul(c, c); + v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c); + a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a)); + a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a); + a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a); + return v_mul(a, this->s); } v_float32 eps; @@ -125,7 +125,7 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); v_atan_f32 v(scale); for( ; i < len; i += VECSZ*2 ) @@ -198,7 +198,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -209,8 +209,8 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) } v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); - x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); - x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0))); + x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1))); v_store(mag + i, x0); v_store(mag + i + VECSZ, x1); } @@ -231,7 +231,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -242,8 +242,8 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) } v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); - x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); - x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0))); + x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1))); v_store(mag + i, x0); v_store(mag + i + VECSZ, x1); } @@ -265,7 +265,7 @@ void invSqrt32f(const float* src, float* dst, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -293,7 +293,7 @@ void invSqrt64f(const double* src, double* dst, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int 
VECSZ = VTraits::vlanes(); for ( ; i < len; i += VECSZ*2) { if( i + VECSZ*2 > len ) @@ -321,7 +321,7 @@ void sqrt32f(const float* src, float* dst, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -350,7 +350,7 @@ void sqrt64f(const double* src, double* dst, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -452,7 +452,7 @@ void exp32f( const float *_x, float *y, int n ) float postscale = (float)exp_postscale; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float32 vprescale = vx_setall_f32((float)exp_prescale); const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); const v_float32 vminval = vx_setall_f32(minval); @@ -481,26 +481,26 @@ void exp32f( const float *_x, float *y, int n ) xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); - xf0 *= vprescale; - xf1 *= vprescale; + xf0 = v_mul(xf0, vprescale); + xf1 = v_mul(xf1, vprescale); v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); - xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; - xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; + xf0 = v_mul(v_sub(xf0, v_cvt_f32(xi0)), vpostscale); + xf1 = v_mul(v_sub(xf1, v_cvt_f32(xi1)), vpostscale); - v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); - v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); + v_float32 yf0 = v_lut(expTab_f, v_and(xi0, vidxmask)); + v_float32 yf1 = v_lut(expTab_f, v_and(xi1, vidxmask)); v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); - xi0 = v_min(v_max(v_shr(xi0) + v127, v0), v255); - xi1 = v_min(v_max(v_shr(xi1) + v127, v0), v255); + xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v127), v0), v255); + xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v127), v0), v255); - yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0)); - yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); + yf0 = v_mul(yf0, v_reinterpret_as_f32(v_shl<23>(xi0))); + yf1 = v_mul(yf1, v_reinterpret_as_f32(v_shl<23>(xi1))); - v_float32 zf0 = xf0 + vA1; - v_float32 zf1 = xf1 + vA1; + v_float32 zf0 = v_add(xf0, vA1); + v_float32 zf1 = v_add(xf1, vA1); zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); @@ -511,8 +511,8 @@ void exp32f( const float *_x, float *y, int n ) zf0 = v_fma(zf0, xf0, vA4); zf1 = v_fma(zf1, xf1, vA4); - zf0 *= yf0; - zf1 *= yf1; + zf0 = v_mul(zf0, yf0); + zf1 = v_mul(zf1, yf1); if( y_aligned ) { @@ -566,7 +566,7 @@ void exp64f( const double *_x, double *y, int n ) double maxval = (exp_max_val/exp_prescale); #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float64 vprescale = vx_setall_f64(exp_prescale); const v_float64 vpostscale = vx_setall_f64(exp_postscale); const v_float64 vminval = vx_setall_f64(minval); @@ -596,30 +596,30 @@ void exp64f( const double *_x, double *y, int n ) xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); - xf0 *= vprescale; - xf1 *= vprescale; + xf0 = v_mul(xf0, vprescale); + xf1 = v_mul(xf1, vprescale); v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); - xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; - xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; + xf0 = v_mul(v_sub(xf0, v_cvt_f64(xi0)), vpostscale); + xf1 = v_mul(v_sub(xf1, v_cvt_f64(xi1)), vpostscale); - v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); - v_float64 yf1 = v_lut(expTab, xi1 & 
vidxmask); + v_float64 yf0 = v_lut(expTab, v_and(xi0, vidxmask)); + v_float64 yf1 = v_lut(expTab, v_and(xi1, vidxmask)); v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); - xi0 = v_min(v_max(v_shr(xi0) + v1023, v0), v2047); - xi1 = v_min(v_max(v_shr(xi1) + v1023, v0), v2047); + xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v1023), v0), v2047); + xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v1023), v0), v2047); v_int64 xq0, xq1, dummy; v_expand(xi0, xq0, dummy); v_expand(xi1, xq1, dummy); - yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); - yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); + yf0 = v_mul(yf0, v_reinterpret_as_f64(v_shl<52>(xq0))); + yf1 = v_mul(yf1, v_reinterpret_as_f64(v_shl<52>(xq1))); - v_float64 zf0 = xf0 + vA1; - v_float64 zf1 = xf1 + vA1; + v_float64 zf0 = v_add(xf0, vA1); + v_float64 zf1 = v_add(xf1, vA1); zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); @@ -633,8 +633,8 @@ void exp64f( const double *_x, double *y, int n ) zf0 = v_fma(zf0, xf0, vA5); zf1 = v_fma(zf1, xf1, vA5); - zf0 *= yf0; - zf1 *= yf1; + zf0 = v_mul(zf0, yf0); + zf1 = v_mul(zf1, yf1); if( y_aligned ) { @@ -696,7 +696,7 @@ void log32f( const float *_x, float *y, int n ) const int* x = (const int*)_x; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float32 vln2 = vx_setall_f32((float)ln_2); const v_float32 v1 = vx_setall_f32(1.f); const v_float32 vshift = vx_setall_f32(-1.f/512); @@ -715,18 +715,18 @@ void log32f( const float *_x, float *y, int n ) } v_int32 h0 = vx_load(x + i); - v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127); - v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23); + v_int32 yi0 = v_sub(v_and(v_shr<23>(h0), vx_setall_s32(255)), vx_setall_s32(127)); + v_int32 xi0 = v_or(v_and(h0, vx_setall_s32(LOGTAB_MASK2_32F)), vx_setall_s32(127 << 23)); - h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2); + h0 = v_and(v_shr<23 - 8 - 1>(h0), vx_setall_s32(((1 << 8) - 1) * 2)); v_float32 yf0, xf0; v_lut_deinterleave(logTab_f, h0, yf0, xf0); yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0); - v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall(0)); - xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta); + v_float32 delta = v_select(v_reinterpret_as_f32(v_eq(h0, vx_setall_s32(510))), vshift, vx_setall(0)); + xf0 = v_fma((v_sub(v_reinterpret_as_f32(xi0), v1)), xf0, delta); v_float32 zf0 = v_fma(xf0, vA0, vA1); zf0 = v_fma(zf0, xf0, vA2); @@ -771,7 +771,7 @@ void log64f( const double *x, double *y, int n ) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float64 vln2 = vx_setall_f64(ln_2); const v_float64 @@ -791,20 +791,20 @@ void log64f( const double *x, double *y, int n ) v_int64 h0 = vx_load((const int64*)x + i); v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64()); - yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023); + yi0 = v_sub(v_and(yi0, vx_setall_s32(2047)), vx_setall_s32(1023)); - v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52); + v_int64 xi0 = v_or(v_and(h0, vx_setall_s64(LOGTAB_MASK2_64F)), vx_setall_s64((int64)1023 << 52)); h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0); - v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2); + v_int32 idx = v_and(v_pack(h0, h0), vx_setall_s32(((1 << 8) - 1) * 2)); v_float64 xf0, yf0; v_lut_deinterleave(logTab, idx, yf0, xf0); yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0); - 
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512); - xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta); + v_float64 delta = v_mul(v_cvt_f64(v_eq(idx, vx_setall_s32(510))), vx_setall_f64(1. / 512)); + xf0 = v_fma(v_sub(v_reinterpret_as_f64(xi0), vx_setall_f64(1.)), xf0, delta); - v_float64 xq = xf0*xf0; + v_float64 xq = v_mul(xf0, xf0); v_float64 zf0 = v_fma(xq, vA0, vA2); v_float64 zf1 = v_fma(xq, vA1, vA3); zf0 = v_fma(zf0, xq, vA4); diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 3a9dbd9be8..058666485a 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -1584,7 +1584,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, v_float32x4 _m2h = v_rotate_left<1>(_m2l); v_float32x4 _m3h = v_rotate_left<1>(_m3l); v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0); - for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 ) + for( ; x <= len*3 - VTraits::vlanes(); x += 3*VTraits::vlanes()/4 ) v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack( v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)), v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta)))); @@ -1664,10 +1664,10 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i v_float32x4 _m2 = v_load(m + 10); v_float32x4 _m3 = v_load(m + 15); v_float32x4 _m4(m[4], m[9], m[14], m[19]); - for( ; x < len*4; x += v_float32x4::nlanes ) + for( ; x < len*4; x += VTraits::vlanes() ) { v_float32x4 v_src = v_load(src + x); - v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4); + v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, _m0), v_mul(v_src, _m1), v_mul(v_src, _m2), v_mul(v_src, _m3)), _m4)); } #else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128 for( ; x < len*4; x += 4 ) @@ -2113,12 +2113,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double for( k = 0; k < size.height; k++, tsrc += srcstep ) { v_float64x2 a = v_setall_f64((double)col_buf[k]); - s0 += a * v_load(tsrc+0); - s1 += a * v_load(tsrc+2); + s0 = v_add(s0, v_mul(a, v_load(tsrc + 0))); + s1 = v_add(s1, v_mul(a, v_load(tsrc + 2))); } - v_store((double*)(tdst+j), s0*v_scale); - v_store((double*)(tdst+j+2), s1*v_scale); + v_store((double*)(tdst+j), v_mul(s0, v_scale)); + v_store((double*)(tdst+j+2), v_mul(s1, v_scale)); } else #endif { @@ -2174,12 +2174,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) { v_float64x2 a = v_setall_f64((double)col_buf[k]); - s0 += a * (v_load(tsrc+0) - v_load(d+0)); - s1 += a * (v_load(tsrc+2) - v_load(d+2)); + s0 = v_add(s0, v_mul(a, v_sub(v_load(tsrc + 0), v_load(d + 0)))); + s1 = v_add(s1, v_mul(a, v_sub(v_load(tsrc + 2), v_load(d + 2)))); } - v_store((double*)(tdst+j), s0*v_scale); - v_store((double*)(tdst+j+2), s1*v_scale); + v_store((double*)(tdst+j), v_mul(s0, v_scale)); + v_store((double*)(tdst+j+2), v_mul(s1, v_scale)); } else #endif @@ -2249,8 +2249,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double v_float64x2 v_s = v_setzero_f64(); for( k = 0; k <= size.width - 4; k += 4 ) - v_s += (v_load(v_tsrc1+k) * v_load(v_tsrc2+k)) + - (v_load(v_tsrc1+k+2) * v_load(v_tsrc2+k+2)); + v_s = v_add(v_s, 
v_add(v_mul(v_load(v_tsrc1 + k), v_load(v_tsrc2 + k)), v_mul(v_load(v_tsrc1 + k + 2), v_load(v_tsrc2 + k + 2)))); s += v_reduce_sum(v_s); } else @@ -2303,8 +2302,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double v_float64x2 v_s = v_setzero_f64(); for( k = 0; k <= size.width - 4; k += 4, v_tdelta2 += delta_shift ) - v_s += ((v_load(v_tsrc2+k) - v_load(v_tdelta2)) * v_load(v_row_buf+k)) + - ((v_load(v_tsrc2+k+2) - v_load(v_tdelta2+2)) * v_load(v_row_buf+k+2)); + v_s = v_add(v_s, v_add(v_mul(v_sub(v_load(v_tsrc2 + k), v_load(v_tdelta2)), v_load(v_row_buf + k)), v_mul(v_sub(v_load(v_tsrc2 + k + 2), v_load(v_tdelta2 + 2)), v_load(v_row_buf + k + 2)))); s += v_reduce_sum(v_s); tdelta2 = (const dT *)(v_tdelta2); @@ -2566,7 +2564,7 @@ double dotProd_32s(const int* src1, const int* src2, int len) v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0); v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1); } - v_sum0 += v_sum1; + v_sum0 = v_add(v_sum0, v_sum1); #endif for (; i < len - step; i += step, src1 += step, src2 += step) { diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp index c4c7a73b4c..5a80ac8ca7 100644 --- a/modules/core/src/matrix_transform.cpp +++ b/modules/core/src/matrix_transform.cpp @@ -356,10 +356,10 @@ void transposeND(InputArray src_, const std::vector& order, OutputArray dst #if CV_SIMD128 template CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) { - typedef typename V::lane_type T; + typedef typename VTraits::lane_type T; int end = (int)(size.width*esz); int width = (end + 1)/2; - int width_1 = width & -v_uint8x16::nlanes; + int width_1 = width & -VTraits::vlanes(); int i, j; #if CV_STRONG_ALIGNMENT @@ -368,15 +368,15 @@ template CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s for( ; size.height--; src += sstep, dst += dstep ) { - for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) + for( i = 0, j = end; i < width_1; i += VTraits::vlanes(), j -= VTraits::vlanes() ) { V t0, t1; t0 = v_load((T*)((uchar*)src + i)); - t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes)); + t1 = v_load((T*)((uchar*)src + j - VTraits::vlanes())); t0 = v_reverse(t0); t1 = v_reverse(t1); - v_store((T*)(dst + j - v_uint8x16::nlanes), t0); + v_store((T*)(dst + j - VTraits::vlanes()), t0); v_store((T*)(dst + i), t1); } if (isAligned(src, dst)) @@ -446,14 +446,14 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, #if CV_STRONG_ALIGNMENT size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep; #endif - if (esz == 2 * v_uint8x16::nlanes) + if (esz == 2 * (size_t)VTraits::vlanes()) { int end = (int)(size.width*esz); int width = end/2; for( ; size.height--; src += sstep, dst += dstep ) { - for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes ) + for( int i = 0, j = end - 2 * VTraits::vlanes(); i < width; i += 2 * VTraits::vlanes(), j -= 2 * VTraits::vlanes() ) { #if CV_SIMD256 v_uint8x32 t0, t1; @@ -466,25 +466,25 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, v_uint8x16 t0, t1, t2, t3; t0 = v_load((uchar*)src + i); - t1 = v_load((uchar*)src + i + v_uint8x16::nlanes); + t1 = v_load((uchar*)src + i + VTraits::vlanes()); t2 = v_load((uchar*)src + j); - t3 = v_load((uchar*)src + j + v_uint8x16::nlanes); + t3 = v_load((uchar*)src + j + VTraits::vlanes()); v_store(dst + j, 
t0); - v_store(dst + j + v_uint8x16::nlanes, t1); + v_store(dst + j + VTraits::vlanes(), t1); v_store(dst + i, t2); - v_store(dst + i + v_uint8x16::nlanes, t3); + v_store(dst + i + VTraits::vlanes(), t3); #endif } } } - else if (esz == v_uint8x16::nlanes) + else if (esz == (size_t)VTraits::vlanes()) { int end = (int)(size.width*esz); int width = end/2; for( ; size.height--; src += sstep, dst += dstep ) { - for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) + for( int i = 0, j = end - VTraits::vlanes(); i < width; i += VTraits::vlanes(), j -= VTraits::vlanes() ) { v_uint8x16 t0, t1; @@ -534,19 +534,19 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, for( ; size.height--; src += sstep, dst += dstep ) { - for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) ) + for ( int i = 0, j = end; i < width; i += VTraits::vlanes() + sizeof(uint64_t), j -= VTraits::vlanes() + sizeof(uint64_t) ) { v_uint8x16 t0, t1; uint64_t t2, t3; t0 = v_load((uchar*)src + i); - t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes)); - t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t)); + t2 = *((uint64_t*)((uchar*)src + i + VTraits::vlanes())); + t1 = v_load((uchar*)src + j - VTraits::vlanes() - sizeof(uint64_t)); t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t))); - v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0); + v_store(dst + j - VTraits::vlanes() - sizeof(uint64_t), t0); *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2; v_store(dst + i, t1); - *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3; + *((uint64_t*)(dst + i + VTraits::vlanes())) = t3; } } } diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index 3a5be11a37..ff3786886e 100644 --- a/modules/core/src/minmax.cpp +++ b/modules/core/src/minmax.cpp @@ -141,7 +141,7 @@ CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a) CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b) { - return b ^ ((a ^ b) & mask); + return v_xor(b, v_and(v_xor(a, b), mask)); } #endif @@ -151,16 +151,16 @@ minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &n T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \ size_t delta ) \ { \ - if ( v_check_any(idxMin != none) ) \ + if ( v_check_any(v_ne(idxMin, none)) ) \ { \ minVal = v_reduce_min(valMin); \ - minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \ + minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)minVal), valMin)), \ idxMin, v_setall_##suffix2(maxLimit))) + delta; \ } \ - if ( v_check_any(idxMax != none) ) \ + if ( v_check_any(v_ne(idxMax, none)) ) \ { \ maxVal = v_reduce_max(valMax); \ - maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \ + maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)maxVal), valMax)), \ idxMax, v_setall_##suffix2(maxLimit))) + delta; \ } \ } @@ -210,18 +210,18 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_uint8x16::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, 
maxVal, minIdx, maxIdx, - (int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 ); + (int)0, (int)UCHAR_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_uint8x16::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes); + v_uint8x16 inc = v_setall_u8((uchar)VTraits::vlanes()); v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -235,31 +235,31 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint8x16 data = v_load(src + k); - v_uint8x16 cmpMin = (data < valMin); - v_uint8x16 cmpMax = (data > valMax); + v_uint8x16 cmpMin = (v_lt(data, valMin)); + v_uint8x16 cmpMax = (v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint8x16 data = v_load(src + k); - v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8(); - v_uint8x16 cmpMin = (data < valMin) & maskVal; - v_uint8x16 cmpMax = (data > valMax) & maskVal; + v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8()); + v_uint8x16 cmpMin = v_and(v_lt(data, valMin), maskVal); + v_uint8x16 cmpMax = v_and(v_gt(data, valMax), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(cmpMin, data, valMin); valMax = v_select(cmpMax, data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -287,18 +287,18 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_int8x16::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 ); + (int)SCHAR_MIN, (int)SCHAR_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_int8x16::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes); + v_uint8x16 inc = v_setall_u8((uchar)VTraits::vlanes()); v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -312,31 +312,31 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int8x16 data = v_load(src + k); - v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin); - v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax); + v_uint8x16 cmpMin = v_reinterpret_as_u8(v_lt(data, valMin)); + v_uint8x16 cmpMax = v_reinterpret_as_u8(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < 
std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int8x16 data = v_load(src + k); - v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8(); - v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal; - v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal; + v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8()); + v_uint8x16 cmpMin = v_and(v_reinterpret_as_u8(v_lt(data, valMin)), maskVal); + v_uint8x16 cmpMax = v_and(v_reinterpret_as_u8(v_gt(data, valMax)), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -364,18 +364,18 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_uint16x8::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 ); + (int)0, (int)USHRT_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_uint16x8::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes); + v_uint16x8 inc = v_setall_u16((uchar)VTraits::vlanes()); v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); @@ -389,31 +389,31 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int if ( !mask ) { - for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint16x8 data = v_load(src + k); - v_uint16x8 cmpMin = (data < valMin); - v_uint16x8 cmpMax = (data > valMax); + v_uint16x8 cmpMin = (v_lt(data, valMin)); + v_uint16x8 cmpMax = (v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint16x8 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); - v_uint16x8 cmpMin = (data < valMin) & maskVal; - v_uint16x8 cmpMax = (data > valMax) & maskVal; + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); + v_uint16x8 cmpMin = v_and(v_lt(data, valMin), maskVal); + v_uint16x8 cmpMax = v_and(v_gt(data, valMax), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(cmpMin, data, valMin); valMax = v_select(cmpMax, data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -441,18 +441,18 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_int16x8::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)SHRT_MIN, 
(int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 ); + (int)SHRT_MIN, (int)SHRT_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_int16x8::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes); + v_uint16x8 inc = v_setall_u16((uchar)VTraits::vlanes()); v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); @@ -466,31 +466,31 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int16x8 data = v_load(src + k); - v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin); - v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax); + v_uint16x8 cmpMin = v_reinterpret_as_u16(v_lt(data, valMin)); + v_uint16x8 cmpMax = v_reinterpret_as_u16(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int16x8 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); - v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal; - v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal; + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); + v_uint16x8 cmpMin = v_and(v_reinterpret_as_u16(v_lt(data, valMin)), maskVal); + v_uint16x8 cmpMax = v_and(v_reinterpret_as_u16(v_gt(data, valMax)), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -518,14 +518,14 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= 2 * v_int32x4::nlanes ) + if ( len >= 2 * VTraits::vlanes() ) { - int j = 0, len0 = len & -(2 * v_int32x4::nlanes); + int j = 0, len0 = len & -(2 * VTraits::vlanes()); int minVal = *minval, maxVal = *maxval; size_t minIdx = *minidx, maxIdx = *maxidx; { - v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes); + v_uint32x4 inc = v_setall_u32(VTraits::vlanes()); v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1)); v_uint32x4 idxStart(0, 1, 2, 3); @@ -539,49 +539,49 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m if ( !mask ) { - for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_int32x4 data = v_load(src + k); - v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); - v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_int32x4::nlanes); - cmpMin = v_reinterpret_as_u32(data < valMin); - 
cmpMax = v_reinterpret_as_u32(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_int32x4 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); - v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1); - v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal1)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal1)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_int32x4::nlanes); - cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2); - cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal2)); + cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal2)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -609,18 +609,18 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= 2 * v_float32x4::nlanes ) + if ( len >= 2 * VTraits::vlanes() ) { int j, len0; float minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 ); + FLT_MIN, FLT_MAX, 2 * VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - 2 * v_float32x4::nlanes ) + if ( j <= len0 - 2 * VTraits::vlanes() ) { - v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes); + v_uint32x4 inc = v_setall_u32(VTraits::vlanes()); v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1)); v_uint32x4 idxStart(0, 1, 2, 3); @@ -634,49 +634,49 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl if ( !mask ) { - for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_float32x4 data = v_load(src + k); - v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); - v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); 
valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_float32x4::nlanes); - cmpMin = v_reinterpret_as_u32(data < valMin); - cmpMax = v_reinterpret_as_u32(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_float32x4 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); - v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1); - v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal1)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal1)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_float32x4::nlanes); - cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2); - cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal2)); + cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal2)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -704,18 +704,18 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128_64F - if ( len >= 4 * v_float64x2::nlanes ) + if ( len >= 4 * VTraits::vlanes() ) { int j, len0; double minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 ); + DBL_MIN, DBL_MAX, 4 * VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - 4 * v_float64x2::nlanes ) + if ( j <= len0 - 4 * VTraits::vlanes() ) { - v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes); + v_uint64x2 inc = v_setall_u64(VTraits::vlanes()); v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1)); v_uint64x2 idxStart(0, 1); @@ -729,84 +729,84 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, if ( !mask ) { - for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + for( ; k < std::min(len0, j + 32764 * 4 * VTraits::vlanes()); k += 4 * VTraits::vlanes() ) { v_float64x2 data = v_load(src + k); - v_uint64x2 cmpMin = 
v_reinterpret_as_u64(data < valMin); - v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax); + v_uint64x2 cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + v_uint64x2 cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + 2 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + 2 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + 3 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + 3 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + for( ; k < std::min(len0, j + 32764 * 4 * VTraits::vlanes()); k += 4 * VTraits::vlanes() ) { v_float64x2 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); v_int64x2 maskVal3, maskVal4; v_expand(maskVal1, maskVal3, maskVal4); - v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); - v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + v_uint64x2 cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3)); + v_uint64x2 cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = 
v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + 2 * v_float64x2::nlanes); + idx = v_add(idx, inc); + data = v_load(src + k + 2 * VTraits::vlanes()); v_expand(maskVal2, maskVal3, maskVal4); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + 3 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idx = v_add(idx, inc); + data = v_load(src + k + 3 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 1ece6de82f..38b8d10f7b 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1745,13 +1745,8 @@ template struct TheTest R a = dataA; R b = dataB; -#if CV_SIMD_SCALABLE Data dataEQ = v_eq(a, b); Data dataNE = v_ne(a, b); -#else - Data dataEQ = (a == b); - Data dataNE = (a != b); -#endif for (int i = 0; i < VTraits::vlanes(); ++i) { diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index 60301a406c..ba9b31fe35 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -29,10 +29,10 @@ static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, v_int32x4 t0, t1; v_mul_expand(a0, b0, t0, t1); - out0 += t0; out1 += t1; + out0 = v_add(out0, t0); out1 = v_add(out1, t1); v_mul_expand(a1, b1, t0, t1); - out2 += t0; out3 += t1; + out2 = v_add(out2, t0); out3 = v_add(out3, t1); } #endif @@ -1055,10 +1055,10 @@ public: v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); - vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult); - vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult); - vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult); - vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult); + vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult))); + vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult))); + vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult))); + vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult))); vout0 = v_min(v_max(vout0, outmin), outmax); vout1 = v_min(v_max(vout1, outmin), outmax); @@ -1408,12 +1408,12 @@ public: vs12 = v_dotprod_expand_fast(w1, r2, vs12); vs13 = v_dotprod_expand_fast(w1, r3, vs13); } - s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), 
v_reduce_sum(vs02), v_reduce_sum(vs03)); - s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)); + s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03))); + s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13))); if( cn1 == inpCn ) { - s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0); - s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1); + s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0))); + s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1))); s0 = v_min(v_max(s0, outmin), outmax); s1 = v_min(v_max(s1, outmin), outmax); diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index b8e3bd6ee5..ba5b0d79c1 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -323,8 +323,8 @@ public: vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3); } - s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)); - v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult); + s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3))); + v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult))); v_store(dptr + i, v_min(v_max(out, outmin), outmax)); } #endif diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp b/modules/dnn/src/int8layers/pooling_layer.cpp index bfff3d34c5..b321d730f7 100644 --- a/modules/dnn/src/int8layers/pooling_layer.cpp +++ b/modules/dnn/src/int8layers/pooling_layer.cpp @@ -631,17 +631,17 @@ public: (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]); v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13], (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]); - sum_val0 += v0; - sum_val1 += v1; - sum_val2 += v2; - sum_val3 += v3; + sum_val0 = v_add(sum_val0, v0); + sum_val1 = v_add(sum_val1, v1); + sum_val2 = v_add(sum_val2, v2); + sum_val3 = v_add(sum_val3, v3); } } - sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp; - sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp; - sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp; - sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp; + sum_val0 = v_add(v_round(v_mul(v_cvt_f32(sum_val0), ikarea)), voutzp); + sum_val1 = v_add(v_round(v_mul(v_cvt_f32(sum_val1), ikarea)), voutzp); + sum_val2 = v_add(v_round(v_mul(v_cvt_f32(sum_val2), ikarea)), voutzp); + sum_val3 = v_add(v_round(v_mul(v_cvt_f32(sum_val3), ikarea)), voutzp); v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3))); x0 += 15; diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp index 3e969336ad..59f069eeaa 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp @@ -236,13 +236,11 @@ void depthWiseBlockConv2D(const float* wptr, v21 = v_load(imgptr2 + in_j + dilation_w), v22 = v_load(imgptr2 + in_j + dilation_w*2); - v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + - v10*vw10 + v11*vw11 + v12*vw12 + - v20*vw20 + v21*vw21 + v22*vw22 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, 
vw21)), v_mul(v22, vw22)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -268,14 +266,12 @@ void depthWiseBlockConv2D(const float* wptr, v_load_deinterleave(imgptr2 + in_j, v20, v21); v_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + - v10 * vw10 + v11 * vw11 + v12 * vw12 + - v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -381,11 +377,11 @@ void depthWiseBlockConv1D(const float* wptr, v01 = v_load(imgptr0 + in_j + dilation_w), v02 = v_load(imgptr0 + in_j + dilation_w*2); - v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -407,13 +403,13 @@ void depthWiseBlockConv1D(const float* wptr, v_load_deinterleave(imgptr0 + in_j, v00, v01); v_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp index a18943994c..605cf37949 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -430,32 +430,32 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; - t00 = x40 - x20; - t01 = x41 - x21; - t10 = x30 - x50; - t11 = x31 - x51; - v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); - v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); - v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); - v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); + t00 = v_sub(x40, x20); + t01 = v_sub(x41, x21); + t10 = v_sub(x30, x50); + t11 = v_sub(x31, x51); + v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60)); + v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61)); + v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10)); + v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11)); /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ v_float32x4 
qm4_25 = v_setall_f32(-4.25f); - t00 = v_fma(x30, qm4_25, x10 + x50); - t01 = v_fma(x31, qm4_25, x11 + x51); - t10 = v_fma(x40, qm4_25, x20 + x60); - t11 = v_fma(x41, qm4_25, x21 + x61); + t00 = v_fma(x30, qm4_25, v_add(x10, x50)); + t01 = v_fma(x31, qm4_25, v_add(x11, x51)); + t10 = v_fma(x40, qm4_25, v_add(x20, x60)); + t11 = v_fma(x41, qm4_25, v_add(x21, x61)); - v_float32x4 y10 = t00 + t10, y11 = t01 + t11; - v_float32x4 y20 = t10 - t00, y21 = t11 - t01; + v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11); + v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01); /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); - t00 = v_fma(x10, q0_5, x50 + x50); - t01 = v_fma(x11, q0_5, x51 + x51); + t00 = v_fma(x10, q0_5, v_add(x50, x50)); + t01 = v_fma(x11, q0_5, v_add(x51, x51)); t10 = v_fma(x20, q0_25, x60); t11 = v_fma(x21, q0_25, x61); t00 = v_fma(x30, qm2_5, t00); @@ -463,14 +463,14 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(x40, qm1_25, t10); t11 = v_fma(x41, qm1_25, t11); - v_float32x4 y30 = t00 + t10, y31 = t01 + t11; - v_float32x4 y40 = t10 - t00, y41 = t11 - t01; + v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11); + v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01); /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); - t00 = v_fma(x50, q0_5, x10 + x10); - t01 = v_fma(x51, q0_5, x11 + x11); + t00 = v_fma(x50, q0_5, v_add(x10, x10)); + t01 = v_fma(x51, q0_5, v_add(x11, x11)); t10 = v_fma(x20, q4 , x60); t11 = v_fma(x21, q4 , x61); t00 = v_fma(x30, qm2_5, t00); @@ -478,8 +478,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(x40, qm5 , t10); t11 = v_fma(x41, qm5 , t11); - v_float32x4 y50 = t00 + t10, y51 = t01 + t11; - v_float32x4 y60 = t10 - t00, y61 = t11 - t01; + v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11); + v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01); /* transpose 8x8 matrix with v_transpose4x4 */ @@ -491,29 +491,29 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = y010 - y200; - t01 = y410 - y600; - t10 = y300 - y110; - t11 = y700 - y510; - z00 = v_fma(t00, q5_25, y000 - y210); - z01 = v_fma(t01, q5_25, y400 - y610); - z70 = v_fma(t10, q5_25, y310 - y100); - z71 = v_fma(t11, q5_25, y710 - y500); + t00 = v_sub(y010, y200); + t01 = v_sub(y410, y600); + t10 = v_sub(y300, y110); + t11 = v_sub(y700, y510); + z00 = v_fma(t00, q5_25, v_sub(y000, y210)); + z01 = v_fma(t01, q5_25, v_sub(y400, y610)); + z70 = v_fma(t10, q5_25, v_sub(y310, y100)); + z71 = v_fma(t11, q5_25, v_sub(y710, y500)); /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = v_fma(y300, qm4_25, y100 + y110); - t01 = v_fma(y700, qm4_25, y500 + y510); - t10 = v_fma(y010, qm4_25, y200 + y210); - t11 = v_fma(y410, qm4_25, y600 + y610); + t00 = v_fma(y300, qm4_25, v_add(y100, y110)); + t01 = v_fma(y700, qm4_25, v_add(y500, y510)); + t10 = v_fma(y010, qm4_25, v_add(y200, y210)); + t11 = v_fma(y410, qm4_25, v_add(y600, y610)); - z10 = t00 + t10; z11 = t01 + t11; 
- z20 = t10 - t00; z21 = t11 - t01; + z10 = v_add(t00, t10); z11 = v_add(t01, t11); + z20 = v_sub(t10, t00); z21 = v_sub(t11, t01); /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = v_fma(y100, q0_5, y110 + y110); - t01 = v_fma(y500, q0_5, y510 + y510); + t00 = v_fma(y100, q0_5, v_add(y110, y110)); + t01 = v_fma(y500, q0_5, v_add(y510, y510)); t10 = v_fma(y200, q0_25, y210); t11 = v_fma(y600, q0_25, y610); t00 = v_fma(y300, qm2_5, t00); @@ -521,13 +521,13 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(y010, qm1_25, t10); t11 = v_fma(y410, qm1_25, t11); - z30 = t00 + t10; z31 = t01 + t11; - z40 = t10 - t00; z41 = t11 - t01; + z30 = v_add(t00, t10); z31 = v_add(t01, t11); + z40 = v_sub(t10, t00); z41 = v_sub(t11, t01); /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = v_fma(y110, q0_5, y100 + y100); - t01 = v_fma(y510, q0_5, y500 + y500); + t00 = v_fma(y110, q0_5, v_add(y100, y100)); + t01 = v_fma(y510, q0_5, v_add(y500, y500)); t10 = v_fma(y200, q4, y210); t11 = v_fma(y600, q4, y610); t00 = v_fma(y300, qm2_5, t00); @@ -535,8 +535,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(y010, qm5, t10); t11 = v_fma(y410, qm5, t11); - z50 = t00 + t10; z51 = t01 + t11; - z60 = t10 - t00; z61 = t11 - t01; + z50 = v_add(t00, t10); z51 = v_add(t01, t11); + z60 = v_sub(t10, t00); z61 = v_sub(t11, t01); } const int outstep = winoIblock*winoAtomF32*Cg; @@ -601,12 +601,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, { v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = x10 + x20; s12_1 = x11 + x21; - s34_0 = x30 + x40; s34_1 = x31 + x41; - s56_0 = x50 + x60; s56_1 = x51 + x61; + s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21); + s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41); + s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61); - v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; - v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; + v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0); + v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1); v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -616,13 +616,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - s12_0 = x10 - x20; s12_1 = x11 - x21; - s34_0 = x30 - x40; s34_1 = x31 - x41; - s56_0 = x50 - x60; s56_1 = x51 - x61; + s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21); + s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41); + s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61); a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); - v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); - v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); + v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0))); + v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1))); a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -642,12 +642,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); - s12_0 = y100 + y200; s12_1 = y500 + y600; - s34_0 = y300 + y010; s34_1 
= y700 + y410; - s56_0 = y110 + y210; s56_1 = y510 + y610; + s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600); + s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410); + s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610); - z00 = y000 + s12_0 + s34_0 + s56_0; - z01 = y400 + s12_1 + s34_1 + s56_1; + z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0); + z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1); a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -657,13 +657,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - s12_0 = y100 - y200; s12_1 = y500 - y600; - s34_0 = y300 - y010; s34_1 = y700 - y410; - s56_0 = y110 - y210; s56_1 = y510 - y610; + s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600); + s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410); + s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610); a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); - z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y310 + s12_0)); - z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y710 + s12_1)); + z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0))); + z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1))); a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); @@ -673,34 +673,34 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); v_float32x4 vbias = v_setall_f32(bias); - z00 += vbias; - z01 += vbias; - z10 += vbias; - z11 += vbias; - z20 += vbias; - z21 += vbias; - z30 += vbias; - z31 += vbias; - z40 += vbias; - z41 += vbias; - z50 += vbias; - z51 += vbias; + z00 = v_add(z00, vbias); + z01 = v_add(z01, vbias); + z10 = v_add(z10, vbias); + z11 = v_add(z11, vbias); + z20 = v_add(z20, vbias); + z21 = v_add(z21, vbias); + z30 = v_add(z30, vbias); + z31 = v_add(z31, vbias); + z40 = v_add(z40, vbias); + z41 = v_add(z41, vbias); + z50 = v_add(z50, vbias); + z51 = v_add(z51, vbias); } if (bpptr) { - z00 += v_load(bpptr); - z01 += v_load_low(bpptr + 4); - z10 += v_load(bpptr + bpstep); - z11 += v_load_low(bpptr + bpstep + 4); - z20 += v_load(bpptr + bpstep*2); - z21 += v_load_low(bpptr + bpstep*2 + 4); - z30 += v_load(bpptr + bpstep*3); - z31 += v_load_low(bpptr + bpstep*3 + 4); - z40 += v_load(bpptr + bpstep*4); - z41 += v_load_low(bpptr + bpstep*4 + 4); - z50 += v_load(bpptr + bpstep*5); - z51 += v_load_low(bpptr + bpstep*5 + 4); + z00 = v_add(z00, v_load(bpptr)); + z01 = v_add(z01, v_load_low(bpptr + 4)); + z10 = v_add(z10, v_load(bpptr + bpstep)); + z11 = v_add(z11, v_load_low(bpptr + bpstep + 4)); + z20 = v_add(z20, v_load(bpptr + bpstep * 2)); + z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4)); + z30 = v_add(z30, v_load(bpptr + bpstep * 3)); + z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4)); + z40 = v_add(z40, v_load(bpptr + bpstep * 4)); + z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4)); + z50 = v_add(z50, v_load(bpptr + bpstep * 5)); + z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4)); } if (ifMinMaxAct) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 815bc2dda4..2a2245b909 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -370,10 +370,10 @@ struct ReLUFunctor : public BaseFunctor v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 
x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s4); - x1 = v_select(x1 >= z, x1, x1*s4); - x2 = v_select(x2 >= z, x2, x2*s4); - x3 = v_select(x3 >= z, x3, x3*s4); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); @@ -2493,10 +2493,10 @@ struct ChannelsPReLUFunctor : public BaseFunctor v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s4); - x1 = v_select(x1 >= z, x1, x1*s4); - x2 = v_select(x2 >= z, x2, x2*s4); - x3 = v_select(x3 >= z, x3, x3*s4); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); @@ -2649,10 +2649,10 @@ struct PReLUFunctor : public ChannelsPReLUFunctor v_float32x4 s1 = v_load(scaleptr + i + 4); v_float32x4 s2 = v_load(scaleptr + i + 8); v_float32x4 s3 = v_load(scaleptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s0); - x1 = v_select(x1 >= z, x1, x1*s1); - x2 = v_select(x2 >= z, x2, x2*s2); - x3 = v_select(x3 >= z, x3, x3*s3); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s0)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s1)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s2)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s3)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index f03af7c1fb..1c27043f1a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -308,7 +308,7 @@ public: } v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3); - s += v_load(biasptr + i); + s = v_add(s, v_load(biasptr + i)); v_store(dptr + i, s); } #endif diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index a75382d8a5..fb980c4152 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -898,25 +898,25 @@ public: v_float32x4 max_idx0 = v_setall_f32(-1.f); v_float32x4 max_idx1 = max_idx0; int index0 = ystart * inp_width + xstart; - v_float32x4 idx0 = idx00 + v_setall_f32((float)index0); - v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4)); + v_float32x4 idx0 = v_add(idx00, v_setall_f32((float)index0)); + v_float32x4 idx1 = v_add(idx0, v_setall_f32((float)(stride_w * 4))); for (int y = ystart; y < yend; ++y) { - for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones) + for (int x = xstart; x < xend; ++x, idx0 = v_add(idx0, ones), idx1 = v_add(idx1, ones)) { const int index = y * inp_width + x; v_float32x4 v0(srcData[index], srcData[index + stride_w], srcData[index + stride_w*2], srcData[index + stride_w*3]); v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5], srcData[index + stride_w*6], srcData[index + stride_w*7]); - max_idx0 = v_select(v0 > max_val0, idx0, max_idx0); - max_idx1 = v_select(v1 > max_val1, idx1, max_idx1); + max_idx0 = v_select(v_gt(v0, max_val0), idx0, max_idx0); + max_idx1 = v_select(v_gt(v1, max_val1), idx1, max_idx1); max_val0 = v_max(max_val0, 
v0); max_val1 = v_max(max_val1, v1); } - idx0 += idx_delta; - idx1 += idx_delta; + idx0 = v_add(idx0, idx_delta); + idx1 = v_add(idx1, idx_delta); } v_store(dstData + x0, max_val0); v_store(dstData + x0 + 4, max_val1); @@ -1069,12 +1069,12 @@ public: srcData[index + stride_w*2], srcData[index + stride_w*3]); v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5], srcData[index + stride_w*6], srcData[index + stride_w*7]); - sum_val0 += v0; - sum_val1 += v1; + sum_val0 = v_add(sum_val0, v0); + sum_val1 = v_add(sum_val1, v1); } } - v_store(dstData + x0, sum_val0*ikarea); - v_store(dstData + x0 + 4, sum_val1*ikarea); + v_store(dstData + x0, v_mul(sum_val0, ikarea)); + v_store(dstData + x0 + 4, v_mul(sum_val1, ikarea)); x0 += 7; } else diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index 163f02717e..cb088eb535 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -120,8 +120,8 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo for (; j < img.cols - 16 - 3; j += 16, ptr += 16) { v_uint8x16 v = v_load(ptr); - v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); - v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); + v_int8x16 v0 = v_reinterpret_as_s8(v_xor(v_add(v, t), delta)); + v_int8x16 v1 = v_reinterpret_as_s8(v_xor(v_sub(v, t), delta)); v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); @@ -129,15 +129,15 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); v_int8x16 m0, m1; - m0 = (v0 < x0) & (v0 < x1); - m1 = (x0 < v1) & (x1 < v1); - m0 = m0 | ((v0 < x1) & (v0 < x2)); - m1 = m1 | ((x1 < v1) & (x2 < v1)); - m0 = m0 | ((v0 < x2) & (v0 < x3)); - m1 = m1 | ((x2 < v1) & (x3 < v1)); - m0 = m0 | ((v0 < x3) & (v0 < x0)); - m1 = m1 | ((x3 < v1) & (x0 < v1)); - m0 = m0 | m1; + m0 = v_and(v_lt(v0, x0), v_lt(v0, x1)); + m1 = v_and(v_lt(x0, v1), v_lt(x1, v1)); + m0 = v_or(m0, v_and(v_lt(v0, x1), v_lt(v0, x2))); + m1 = v_or(m1, v_and(v_lt(x1, v1), v_lt(x2, v1))); + m0 = v_or(m0, v_and(v_lt(v0, x2), v_lt(v0, x3))); + m1 = v_or(m1, v_and(v_lt(x2, v1), v_lt(x3, v1))); + m0 = v_or(m0, v_and(v_lt(v0, x3), v_lt(v0, x0))); + m1 = v_or(m1, v_and(v_lt(x3, v1), v_lt(x0, v1))); + m0 = v_or(m0, m1); if( !v_check_any(m0) ) continue; @@ -154,18 +154,18 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo v_uint8x16 max1 = v_setzero_u8(); for( k = 0; k < N; k++ ) { - v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); - m0 = v0 < x; - m1 = x < v1; + v_int8x16 x = v_reinterpret_as_s8(v_xor(v_load((ptr + pixel[k])), delta)); + m0 = v_lt(v0, x); + m1 = v_lt(x, v1); - c0 = v_sub_wrap(c0, m0) & m0; - c1 = v_sub_wrap(c1, m1) & m1; + c0 = v_and(v_sub_wrap(c0, m0), m0); + c1 = v_and(v_sub_wrap(c1, m1), m1); max0 = v_max(max0, v_reinterpret_as_u8(c0)); max1 = v_max(max1, v_reinterpret_as_u8(c1)); } - max0 = K16 < v_max(max0, max1); + max0 = v_lt(K16, v_max(max0, max1)); unsigned int m = v_signmask(v_reinterpret_as_s8(max0)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) @@ -190,7 +190,7 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo a1 = v_min(a1, v_nms); b1 = v_max(b1, v_nms); } - curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1); + curr[j + k] = 
(uchar)(v_reduce_max(v_max(v_max(a0, a1), v_sub(v_setzero_s16(), v_min(b0, b1)))) - 1); } } } diff --git a/modules/features2d/src/fast_score.cpp b/modules/features2d/src/fast_score.cpp index 0bc011af49..0c43ad5552 100644 --- a/modules/features2d/src/fast_score.cpp +++ b/modules/features2d/src/fast_score.cpp @@ -160,7 +160,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold) q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else @@ -251,7 +251,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else @@ -323,7 +323,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) v0 = v_load(d + 5); q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index 6c517b1f57..927f08d30a 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -335,7 +335,7 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide and calculate s according to above feature v_uint32x4 ss[4]; - v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1); + v_uint32x4 vadd = v_shl(v_setall_u32(1), (hsv_shift - 1)); v_uint32x4 v_diff_exp[4]; v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1)); @@ -406,16 +406,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // start computing H-ch //h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff)))); v_int32x4 hh[4]; - hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]), + hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(v_sub(gg[0], bb[0])), v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))), v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0])))))); - hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]), + hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(v_sub(gg[1], bb[1])), v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))), v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1])))))); - hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]), + hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(v_sub(gg[2], bb[2])), v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))), v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2])))))); - hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]), + hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(v_sub(gg[3], bb[3])), v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))), v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3])))))); @@ -433,16 +433,16 @@ void 
run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // check for negative H v_int32x4 v_h_less_0[4]; - v_h_less_0[0] = (hh[0] < v_setall_s32(0)); - v_h_less_0[1] = (hh[1] < v_setall_s32(0)); - v_h_less_0[2] = (hh[2] < v_setall_s32(0)); - v_h_less_0[3] = (hh[3] < v_setall_s32(0)); + v_h_less_0[0] = (v_lt(hh[0], v_setall_s32(0))); + v_h_less_0[1] = (v_lt(hh[1], v_setall_s32(0))); + v_h_less_0[2] = (v_lt(hh[2], v_setall_s32(0))); + v_h_less_0[3] = (v_lt(hh[3], v_setall_s32(0))); v_int32x4 v_h_180[4]; - v_h_180[0] = hh[0] + v_setall_s32(180); - v_h_180[1] = hh[1] + v_setall_s32(180); - v_h_180[2] = hh[2] + v_setall_s32(180); - v_h_180[3] = hh[3] + v_setall_s32(180); + v_h_180[0] = v_add(hh[0], v_setall_s32(180)); + v_h_180[1] = v_add(hh[1], v_setall_s32(180)); + v_h_180[2] = v_add(hh[2], v_setall_s32(180)); + v_h_180[3] = v_add(hh[3], v_setall_s32(180)); hh[0] = v_select(v_h_less_0[0], v_h_180[0], hh[0]); hh[1] = v_select(v_h_less_0[1], v_h_180[1], hh[1]); diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp index e246f0613b..f7a502f150 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp @@ -64,7 +64,7 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[], bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; - constexpr int nlanes = v_float32x8::nlanes; + const int nlanes = VTraits::vlanes(); if (!xRatioEq1 && !yRatioEq1) { diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp index 332b36646c..77e0328678 100644 --- a/modules/imgproc/src/bilateral_filter.simd.hpp +++ b/modules/imgproc/src/bilateral_filter.simd.hpp @@ -140,9 +140,9 @@ public: #if CV_SIMD128 v_uint32x4 rval = v_setall_u32(sptr[j]); v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)))); wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w); + sum[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(val)), w)); #else int rval = sptr[j]; @@ -407,11 +407,11 @@ public: v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr))); + v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_add(v_add(v_absdiff(b, rb), v_absdiff(g, rg)), v_absdiff(r, rr))))); wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w); - sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w); - sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w); + sum_b[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(b)), w)); + sum_g[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(g)), w)); + sum_r[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(r)), w)); #else int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; @@ -661,12 +661,12 @@ public: v_float32x4 rval = v_setall_f32(sptr[j]); v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); v_float32x4 knan = v_not_nan(val); - v_float32x4 alpha = 
(v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan; + v_float32x4 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex4), v_not_nan(rval)), knan); v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan); wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum((val & knan) * w); + sum[j] += v_reduce_sum(v_mul(v_and(val, knan), w)); #else float rval = sptr[j]; @@ -862,15 +862,15 @@ public: v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32x4 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32x4 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex4), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan); wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum((kb & knan) * w); - sum_g[j] += v_reduce_sum((kg & knan) * w); - sum_r[j] += v_reduce_sum((kr & knan) * w); + sum_b[j] += v_reduce_sum(v_mul(v_and(kb, knan), w)); + sum_g[j] += v_reduce_sum(v_mul(v_and(kg, knan), w)); + sum_r[j] += v_reduce_sum(v_mul(v_and(kr, knan), w)); #else float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp index 735935c04f..f7c8f66a35 100644 --- a/modules/imgproc/src/box_filter.simd.hpp +++ b/modules/imgproc/src/box_filter.simd.hpp @@ -315,7 +315,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -357,10 +357,10 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale))); v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale))); @@ -369,7 +369,7 @@ struct ColumnSum : v_pack_store(D + i, v_dst); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - 
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -396,16 +396,16 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); v_pack_store(D + i, v_dst); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -486,7 +486,7 @@ public BaseColumnFilter v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -546,13 +546,13 @@ public BaseColumnFilter v_uint32x4 ds4 = v_setall_u32((unsigned)ds); v_uint16x8 dd8 = v_setall_u16((ushort)dd); - for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); + v_uint16x8 _sm1 = v_load(Sm + i + VTraits::vlanes()); v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); + v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 _s00, _s01, _s10, _s11; @@ -572,7 +572,7 @@ public BaseColumnFilter v_store(D + i, v_pack_u(r0, r1)); v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16x8::nlanes, _s1); + v_store(SUM + i + VTraits::vlanes(), _s1); } #endif #endif @@ -649,7 +649,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -689,17 +689,17 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale)); v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale)); v_store(D + i, v_pack(v_s0d, v_s01d)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + 
VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -725,15 +725,15 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_store(D + i, v_pack(v_s0, v_s01)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -798,7 +798,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -838,17 +838,17 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale))); v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale))); v_store(D + i, v_pack(v_s0d, v_s01d)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -874,15 +874,15 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -945,7 +945,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -981,7 +981,7 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 
v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale)); @@ -1010,7 +1010,7 @@ struct ColumnSum : v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); @@ -1079,7 +1079,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -1115,7 +1115,7 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale)); @@ -1142,7 +1142,7 @@ struct ColumnSum : v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_store(D + i, v_cvt_f32(v_s0)); diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index d111efdc47..6a8a0ea7f9 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -66,7 +66,7 @@ template static inline cv::v_float32 splineInterpolate(const cv::v ix = v_shl<2>(ix); v_float32 t0, t1, t2, t3; - // assume that v_float32::nlanes == v_int32::nlanes + // assume that VTraits::vlanes() == VTraits::vlanes() if(VTraits::vlanes() == 4) { int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4]; @@ -1388,16 +1388,16 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC) { //LUT idx of origin pt of cube - v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift); - v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift); - v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift); + v_uint16x8 idxsX = v_shr(inX); + v_uint16x8 idxsY = v_shr(inY); + v_uint16x8 idxsZ = v_shr(inZ); //x, y, z are [0; TRILINEAR_BASE) const uint16_t bitMask = (1 << trilinear_shift) - 1; v_uint16x8 bitMaskReg = v_setall_u16(bitMask); - v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg; + v_uint16x8 fracX = v_and(v_shr(inX), bitMaskReg); + v_uint16x8 fracY = v_and(v_shr(inY), bitMaskReg); + v_uint16x8 fracZ = v_and(v_shr(inZ), bitMaskReg); //load values to interpolate for pix0, pix1, .., pix7 v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7; @@ -1407,9 +1407,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11; v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1); v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11); - addrDw0 += addrDw10; 
addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); uint32_t CV_DECL_ALIGNED(16) addrofs[8]; v_store_aligned(addrofs, addrDw0); @@ -1431,9 +1431,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7; v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1); v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_store_aligned(addrofs, addrDw0); v_store_aligned(addrofs + 4, addrDw1); @@ -1476,7 +1476,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 const int16_t* LUT, v_uint16& outA, v_uint16& outB, v_uint16& outC) { - const int vsize = VTraits::max_nlanes; + const int vsize = VTraits::vlanes(); + const int vsize_max = VTraits::max_nlanes; // LUT idx of origin pt of cube v_uint16 tx = v_shr(inX); @@ -1492,7 +1493,7 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20); baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21); - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize_max]; v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0); v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1); @@ -1513,13 +1514,13 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0)); trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1)); - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize_max]; v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0); v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1); v_uint32 a0, a1, b0, b1, c0, c1; - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize_max], vb[vsize_max], vc[vsize_max]; for(int j = 0; j < vsize; j++) { const int16_t* baseLUT = LUT + vbaseIdx[j]; @@ -1649,11 +1650,11 @@ struct RGB2Lab_b vL = v_shr(vL); /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/ - va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift)); + va = v_fma(v_sub(vfX, vfY), v_setall_s32(500), v_setall_s32(abShift+labDescaleShift)); va = v_shr(va); /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/ - vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift)); + vb = v_fma(v_sub(vfY, vfZ), v_setall_s32(200), v_setall_s32(abShift+labDescaleShift)); vb = v_shr(vb); } #endif // CV_NEON @@ -1675,8 +1676,8 @@ struct RGB2Lab_b #if CV_NEON // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of // Lab v_uint8s - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes, - src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes 
) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes(), + src += scn*VTraits::vlanes(), dst += 3*VTraits::vlanes() ) { // Load 4 batches of 4 src v_uint8 vRi, vGi, vBi; @@ -1712,7 +1713,7 @@ struct RGB2Lab_b #endif // CV_NEON #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); const int xyzDescaleShift = 1 << (lab_shift - 1); v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift); v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1; @@ -1752,7 +1753,7 @@ struct RGB2Lab_b v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]); } - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[VTraits::max_nlanes*3]; for(int k = 0; k < 12; k++) { v_store_aligned(vdrgb + k*vsize/4, qrgb[k]); @@ -1784,14 +1785,14 @@ struct RGB2Lab_b v_uint32 x[4], y[4], z[4]; for(int j = 0; j < 4; j++) { - x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift; - y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift; - z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift; + x[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cxrg), v_dotprod(bd[j], cxb1)))); + y[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cyrg), v_dotprod(bd[j], cyb1)))); + z[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], czrg), v_dotprod(bd[j], czb1)))); } // [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz] // [4 per X, 4 per Y, 4 per Z] - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[VTraits::max_nlanes*3]; for(int j = 0; j < 4; j++) { v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]); @@ -1822,7 +1823,7 @@ struct RGB2Lab_b v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift)); for(int k = 0; k < 4; k++) { - vL[k] = (vL[k] + vLshift) >> lab_shift2; + vL[k] = v_shr(v_add(vL[k], vLshift)); } v_uint16 L0, L1; L0 = v_pack(vL[0], vL[1]); @@ -1846,7 +1847,7 @@ struct RGB2Lab_b v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift); for(int k = 0; k < 8; k++) { - ab[k] = (ab[k] + abShift) >> lab_shift2; + ab[k] = v_shr(v_add(ab[k], abShift)); } v_int16 a0, a1, b0, b1; a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]); @@ -1941,7 +1942,7 @@ struct RGB2Lab_f #if CV_SIMD if(enablePackedLab) { - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); static const int nPixels = vsize*2; for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels) { @@ -1973,8 +1974,8 @@ struct RGB2Lab_f #undef clipv /* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */ v_float32 basef = vx_setall_f32(LAB_BASE); - rvec0 *= basef, gvec0 *= basef, bvec0 *= basef; - rvec1 *= basef, gvec1 *= basef, bvec1 *= basef; + rvec0 = v_mul(rvec0, basef), gvec0 = v_mul(gvec0, basef), bvec0 = v_mul(bvec0, basef); + rvec1 = v_mul(rvec1, basef), gvec1 = v_mul(gvec1, basef), bvec1 = v_mul(bvec1, basef); v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1; irvec0 = v_round(rvec0); irvec1 = v_round(rvec1); @@ -2004,8 +2005,8 @@ struct RGB2Lab_f /* dst[i] = L*100.0f */ v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE); - l_vec0 = l_vec0*v100dBase; - l_vec1 = l_vec1*v100dBase; + l_vec0 = v_mul(l_vec0, v100dBase); + l_vec1 = v_mul(l_vec1, v100dBase); /* dst[i + 1] = a*256.0f - 128.0f; dst[i + 2] = b*256.0f - 128.0f; @@ -2043,8 +2044,8 @@ struct RGB2Lab_f static const float _a = (softfloat(16) / softfloat(116)); int i = 0; #if CV_SIMD - const int vsize = v_float32::nlanes; - const 
int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 2 : 1; v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); @@ -2080,9 +2081,9 @@ struct RGB2Lab_f v_float32 vgscale = vx_setall_f32(gscale); for (int k = 0; k < nrepeats; k++) { - R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); - G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); - B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); + R[k] = splineInterpolate(v_mul(R[k], vgscale), gammaTab, GAMMA_TAB_SIZE); + G[k] = splineInterpolate(v_mul(G[k], vgscale), gammaTab, GAMMA_TAB_SIZE); + B[k] = splineInterpolate(v_mul(B[k], vgscale), gammaTab, GAMMA_TAB_SIZE); } } @@ -2090,26 +2091,26 @@ struct RGB2Lab_f v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats]; for (int k = 0; k < nrepeats; k++) { - X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2)); - Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5)); - Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8)); + X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2))); + Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5))); + Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8))); // use spline interpolation instead of direct calculation v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale); - FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); + FX[k] = splineInterpolate(v_mul(X[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + FY[k] = splineInterpolate(v_mul(Y[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + FZ[k] = splineInterpolate(v_mul(Z[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); } v_float32 L[nrepeats], a[nrepeats], b[nrepeats]; for (int k = 0; k < nrepeats; k++) { // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3 - v_float32 mask = Y[k] > (vx_setall_f32(0.008856f)); + v_float32 mask = v_gt(Y[k], (vx_setall_f32(0.008856f))); v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f); - L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]); - a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]); - b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]); + L[k] = v_select(mask, v_fma(v116, FY[k], vm16), v_mul(vx_setall_f32(903.3f),Y[k])); + a[k] = v_mul(vx_setall_f32(500.F), v_sub(FX[k], FY[k])); + b[k] = v_mul(vx_setall_f32(200.F), v_sub(FY[k], FZ[k])); v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]); } @@ -2204,7 +2205,7 @@ struct Lab2RGBfloat float alpha = ColorChannel::max(); #if CV_SIMD - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); const int nrepeats = 2; v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f); for( ; i <= n-vsize*nrepeats; @@ -2221,14 +2222,14 @@ struct Lab2RGBfloat v_float32 vlThresh = vx_setall_f32(lThresh); for(int k = 0; k < nrepeats; k++) { - limask[k] = li[k] <= vlThresh; + limask[k] = v_le(li[k], vlThresh); } v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats]; // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4) v_float32 vinv903 = vx_setall_f32(1.f/903.3f); for(int k = 0; k < nrepeats; k++) { - ylo[k] = li[k] * vinv903; + ylo[k] = v_mul(li[k], vinv903); } v_float32 v7787 = 
vx_setall_f32(7.787f); for(int k = 0; k < nrepeats; k++) @@ -2238,11 +2239,11 @@ struct Lab2RGBfloat v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f); for(int k = 0; k < nrepeats; k++) { - fyhi[k] = (li[k] + v16) * vinv116; + fyhi[k] = v_mul(v_add(li[k], v16), vinv116); } for(int k = 0; k < nrepeats; k++) { - yhi[k] = fyhi[k] * fyhi[k] * fyhi[k]; + yhi[k] = v_mul(fyhi[k], fyhi[k], fyhi[k]); } for(int k = 0; k < nrepeats; k++) { @@ -2265,9 +2266,9 @@ struct Lab2RGBfloat for (int j = 0; j < 2; j++) { v_float32 f = fxz[k*2+j]; - v_float32 fmask = f <= vfTresh; - v_float32 flo = (f - v16_116) * vinv7787; - v_float32 fhi = f*f*f; + v_float32 fmask = v_le(f, vfTresh); + v_float32 flo = v_mul(v_sub(f, v16_116), vinv7787); + v_float32 fhi = v_mul(v_mul(f, f), f); fxz[k*2+j] = v_select(fmask, flo, fhi); } } @@ -2281,9 +2282,9 @@ struct Lab2RGBfloat v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); for(int k = 0; k < nrepeats; k++) { - ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k])); - go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k])); - bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k])); + ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], v_mul(vc2, z[k]))); + go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], v_mul(vc5, z[k]))); + bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], v_mul(vc8, z[k]))); } v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32(); for(int k = 0; k < nrepeats; k++) @@ -2298,9 +2299,9 @@ struct Lab2RGBfloat v_float32 vgscale = vx_setall_f32(gscale); for(int k = 0; k < nrepeats; k++) { - ro[k] *= vgscale; - go[k] *= vgscale; - bo[k] *= vgscale; + ro[k] = v_mul(ro[k], vgscale); + go[k] = v_mul(go[k], vgscale); + bo[k] = v_mul(bo[k], vgscale); } for(int k = 0; k < nrepeats; k++) @@ -2500,8 +2501,8 @@ struct Lab2RGBinteger for(int k = 0; k < 4; k++) { yf[k] = v_lut((const int*)LabToYF_b, lq[k]); - y[k] = yf[k] & mask16; - ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16); + y[k] = v_and(yf[k], mask16); + ify[k] = v_reinterpret_as_s32(v_shr(v_reinterpret_as_u32(yf[k]), 16)); } v_int16 ify0, ify1; @@ -2516,18 +2517,18 @@ struct Lab2RGBinteger v_uint16 mulA = vx_setall_u16(53687); v_uint32 ma[4]; v_uint32 addA = vx_setall_u32(1 << 7); - v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]); - v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]); - adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13))); - adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13))); + v_mul_expand((v_add(a0, v_shl<2>(a0))), mulA, ma[0], ma[1]); + v_mul_expand((v_add(a1, v_shl<2>(a1))), mulA, ma[2], ma[3]); + adiv0 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[0], addA))), (v_shr<13>(v_add(ma[1], addA))))); + adiv1 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[2], addA))), (v_shr<13>(v_add(ma[3], addA))))); v_uint16 mulB = vx_setall_u16(41943); v_uint32 mb[4]; v_uint32 addB = vx_setall_u32(1 << 4); v_mul_expand(b0, mulB, mb[0], mb[1]); v_mul_expand(b1, mulB, mb[2], mb[3]); - bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9)); - bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9)); + bdiv0 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[0], addB)), v_shr<9>(v_add(mb[1], addB)))); + bdiv1 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[2], addB)), v_shr<9>(v_add(mb[3], addB)))); // 0 <= adiv <= 8356, 0 <= bdiv <= 20890 /* x = ifxz[0]; y = y; z = ifxz[1]; */ @@ -2570,7 +2571,7 @@ struct Lab2RGBinteger { bool srgb = issRGB; 
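// Illustrative sketch (not taken from this patch) of the rewrite applied throughout these
// hunks: overloaded operators on universal-intrinsic vectors become the named wrappers
// (v_add, v_mul, v_div, v_fma, ...) and the compile-time ::nlanes becomes
// VTraits<...>::vlanes(), which may only be known at run time on scalable backends.
// It assumes CV_SIMD is enabled; the helper name axpb_f32 is made up for illustration.
#if CV_SIMD
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;
static inline void axpb_f32(const float* a, const float* x, float b, float* y, int n)
{
    const int vl = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
    const v_float32 vb = vx_setall_f32(b);
    int i = 0;
    for (; i <= n - vl; i += vl)
    {
        v_float32 av = vx_load(a + i), xv = vx_load(x + i);
        v_store(y + i, v_fma(av, xv, vb));         // was: av * xv + vb
    }
    for (; i < n; i++)                             // scalar tail
        y[i] = a[i] * x[i] + b;
}
#endif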
ushort* tab = sRGBInvGammaTab_b; - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); v_uint8 valpha = vx_setall_u8(alpha); v_int32 vc[9]; for(int k = 0; k < 9; k++) @@ -2592,9 +2593,9 @@ struct Lab2RGBinteger v_int32 rq[4], gq[4], bq[4]; for(int k = 0; k < 4; k++) { - rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift; - gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift; - bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift; + rq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[0], xq[k]), v_mul(vc[1], yq[k])), v_mul(vc[2], zq[k])), vdescale)); + gq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[3], xq[k]), v_mul(vc[4], yq[k])), v_mul(vc[5], zq[k])), vdescale)); + bq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[6], xq[k]), v_mul(vc[7], yq[k])), v_mul(vc[8], zq[k])), vdescale)); } //limit indices in table and then substitute @@ -2611,7 +2612,7 @@ struct Lab2RGBinteger if(srgb) { // [RRR... , GGG... , BBB...] - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[VTraits::max_nlanes*3]; for (int k = 0; k < 4; k++) v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]); for (int k = 0; k < 4; k++) @@ -2631,9 +2632,9 @@ struct Lab2RGBinteger // rgb = (rgb*255) >> inv_gamma_shift for(int k = 0; k < 4; k++) { - rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift; - gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift; - bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift; + rq[k] = v_shr((v_sub(v_shl(rq[k], 8), rq[k])), inv_gamma_shift); + gq[k] = v_shr((v_sub(v_shl(gq[k], 8), gq[k])), inv_gamma_shift); + bq[k] = v_shr((v_sub(v_shl(bq[k], 8), bq[k])), inv_gamma_shift); } rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1])); rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3])); @@ -2730,13 +2731,13 @@ struct Lab2RGB_b static const softfloat fl = softfloat(100)/f255; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 vl = vx_setall_f32((float)fl); v_float32 va = vx_setall_f32(1.f); v_float32 vb = vx_setall_f32(1.f); v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, vl, va, vb); v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow); v_float32 mluv[3], aluv[3]; @@ -2754,7 +2755,7 @@ struct Lab2RGB_b j = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); for( ; j <= (dn - vsize)*3; j += 3*vsize ) { v_uint8 s0, s1, s2; @@ -2808,7 +2809,7 @@ struct Lab2RGB_b v_int32 vi[4*3]; for(int k = 0; k < 4*3; k++) { - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_uint8 rgb[3]; @@ -2830,7 +2831,7 @@ struct Lab2RGB_b for(int k = 0; k < 4; k++) { vf[k] = vx_load_aligned(buf + j + k*fsize); - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3]))); } @@ -2910,8 +2911,8 @@ struct RGB2Luvfloat C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; #if CV_SIMD - const int vsize = v_float32::nlanes; - const int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 
2 : 1; for( ; i <= n-vsize*nrepeats; i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats) { @@ -2944,9 +2945,9 @@ struct RGB2Luvfloat v_float32 vgscale = vx_setall_f32(gscale); for (int k = 0; k < nrepeats; k++) { - R[k] *= vgscale; - G[k] *= vgscale; - B[k] *= vgscale; + R[k] = v_mul(R[k], vgscale); + G[k] = v_mul(G[k], vgscale); + B[k] = v_mul(B[k], vgscale); } for (int k = 0; k < nrepeats; k++) @@ -2963,27 +2964,27 @@ struct RGB2Luvfloat v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); for (int k = 0; k < nrepeats; k++) { - X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2)); - Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5)); - Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8)); + X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2))); + Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5))); + Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8))); } v_float32 L[nrepeats], u[nrepeats], v[nrepeats]; v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn); for (int k = 0; k < nrepeats; k++) { - L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + L[k] = splineInterpolate(v_mul(Y[k], vx_setall_f32(LabCbrtTabScale)), LabCbrtTab, LAB_CBRT_TAB_SIZE); // L = 116.f*L - 16.f; L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f)); v_float32 d; // d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON) d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k])); - d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON)); + d = v_div(vx_setall_f32(4.F * 13.F), v_max(d, vx_setall_f32(FLT_EPSILON))); // u = L*(X*d - un) - u[k] = L[k]*v_fma(X[k], d, vmun); + u[k] = v_mul(L[k], v_fma(X[k], d, vmun)); // v = L*((9*0.25f)*Y*d - vn); - v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn); + v[k] = v_mul(L[k], v_fma(v_mul(vx_setall_f32(9.F * 0.25F), Y[k]), d, vmvn)); } for (int k = 0; k < nrepeats; k++) @@ -3099,8 +3100,8 @@ struct Luv2RGBfloat float _un = un, _vn = vn; #if CV_SIMD - const int vsize = v_float32::nlanes; - const int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 2 : 1; for( ; i <= n - vsize*nrepeats; i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats) { @@ -3120,13 +3121,13 @@ struct Luv2RGBfloat v_float32 Ylo, Yhi; // ((L + 16)/116)^3 - Ylo = (L[k] + v16) * v116inv; - Ylo = Ylo*Ylo*Ylo; + Ylo = v_mul(v_add(L[k], v16), v116inv); + Ylo = v_mul(v_mul(Ylo, Ylo), Ylo); // L*(3./29.)^3 - Yhi = L[k] * v903inv; + Yhi = v_mul(L[k], v903inv); // Y = (L <= 8) ? 
Y0 : Y1; - Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi); + Y[k] = v_select(v_ge(L[k], vx_setall_f32(8.f)), Ylo, Yhi); } v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f); @@ -3135,18 +3136,18 @@ struct Luv2RGBfloat v_float32 up, vp; // up = 3*(u + L*_un); - up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k])); + up = v_mul(v3, v_fma(L[k], vx_setall_f32(_un), u[k])); // vp = 0.25/(v + L*_vn); - vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k])); + vp = v_div(v4inv, v_fma(L[k], vx_setall_f32(_vn), v[k])); // vp = max(-0.25, min(0.25, vp)); vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp)); //X = 3*up*vp; // (*Y) is done later - X[k] = v3*up*vp; + X[k] = v_mul(v_mul(v3, up), vp); //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later // xor flips the sign, works like unary minus - Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f)); + Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (v_xor(vx_setall_f32(-0.F), up))), vp, vx_setall_f32(-5.f)); } v_float32 R[nrepeats], G[nrepeats], B[nrepeats]; @@ -3156,9 +3157,9 @@ struct Luv2RGBfloat for(int k = 0; k < nrepeats; k++) { // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done - R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k]; - G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k]; - B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k]; + R[k] = v_mul(v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1)), Y[k]); + G[k] = v_mul(v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4)), Y[k]); + B[k] = v_mul(v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7)), Y[k]); } v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f); @@ -3174,9 +3175,9 @@ struct Luv2RGBfloat v_float32 vgscale = vx_setall_f32(gscale); for(int k = 0; k < nrepeats; k++) { - R[k] *= vgscale; - G[k] *= vgscale; - B[k] *= vgscale; + R[k] = v_mul(R[k], vgscale); + G[k] = v_mul(G[k], vgscale); + B[k] = v_mul(B[k], vgscale); } for(int k = 0; k < nrepeats; k++) { @@ -3285,7 +3286,7 @@ struct RGB2Luvinterpolate #if CV_SIMD if(enablePackedRGB2Luv) { - const int vsize = v_uint16::nlanes; + const int vsize = VTraits::vlanes(); static const int nPixels = vsize*2; for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels) { @@ -3315,9 +3316,9 @@ struct RGB2Luvinterpolate v_expand(r, r0, r1); v_expand(g, g0, g1); v_expand(b, b0, b1); - r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8); - g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8); - b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8); + r0 = v_shl(r0); r1 = v_shl(r1); + g0 = v_shl(g0); g1 = v_shl(g1); + b0 = v_shl(b0); b1 = v_shl(b1); /* int L, u, v; @@ -3332,9 +3333,9 @@ struct RGB2Luvinterpolate dst[i+1] = saturate_cast(u/baseDiv); dst[i+2] = saturate_cast(v/baseDiv); */ - l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8); - u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8); - v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8); + l0 = v_shr(l0); l1 = v_shr(l1); + u0 = v_shr(u0); u1 = v_shr(u1); + v0 = v_shr(v0); v1 = v_shr(v1); v_uint8 l = v_pack(l0, l1); v_uint8 u = v_pack(u0, u1); v_uint8 v = v_pack(v0, v1); @@ -3405,12 +3406,12 @@ struct RGB2Luv_b static const softfloat su = -uLow*f255/uRange; static const softfloat sv = -vLow*f255/vRange; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32(); v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su); v_float32 mv = vx_setall_f32((float)fv), av = 
vx_setall_f32((float)sv); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, ml, mu, mv); v_store_interleave(interTmpA, al, au, av); v_float32 mluv[3], aluv[3]; @@ -3452,7 +3453,7 @@ struct RGB2Luv_b v_float32 f[3*4]; for(int k = 0; k < 3*4; k++) { - f[k] = v_cvt_f32(q[k])*v255inv; + f[k] = v_mul(v_cvt_f32(q[k]), v255inv); } for(int k = 0; k < 4; k++) @@ -3478,8 +3479,8 @@ struct RGB2Luv_b v_int32 q0, q1; v_expand(v_reinterpret_as_s16(d), q0, q1); - v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv); - v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv); + v_store_aligned(buf + j + 0*fsize, v_mul(v_cvt_f32(q0), v255inv)); + v_store_aligned(buf + j + 1*fsize, v_mul(v_cvt_f32(q1), v255inv)); } for( ; j < dn*bufChannels; j++, src++ ) { @@ -3633,7 +3634,8 @@ struct Luv2RGBinteger inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv, v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const { - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); + const int vsize_max = VTraits::max_nlanes; v_uint16 lv0, lv1; v_expand(lv, lv0, lv1); @@ -3646,7 +3648,7 @@ struct Luv2RGBinteger v_int32 mask16 = vx_setall_s32(0xFFFF); for(int k = 0; k < 4; k++) { - y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16; + y[k] = v_and(v_lut((const int *)LabToYF_b, v_reinterpret_as_s32(lq[k])), mask16); } v_int32 up[4], vp[4]; @@ -3657,10 +3659,10 @@ struct Luv2RGBinteger v_expand(vv, vv0, vv1); // LL*256 v_uint16 ll0, ll1; - ll0 = lv0 << 8; ll1 = lv1 << 8; + ll0 = v_shl<8>(lv0); ll1 = v_shl<8>(lv1); v_uint16 upidx0, upidx1, vpidx0, vpidx1; - upidx0 = ll0 + uv0; upidx1 = ll1 + uv1; - vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1; + upidx0 = v_add(ll0, uv0); upidx1 = v_add(ll1, uv1); + vpidx0 = v_add(ll0, vv0); vpidx1 = v_add(ll1, vv1); v_uint32 upidx[4], vpidx[4]; v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]); v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]); @@ -3672,7 +3674,7 @@ struct Luv2RGBinteger // long long int vpl = LUVLUT.LvToVpl_b[LL*256+v]; v_int64 vpl[8]; - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize_max]; for(int k = 0; k < 4; k++) { v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k])); @@ -3684,12 +3686,13 @@ struct Luv2RGBinteger // not all 64-bit arithmetic is available in univ. 
intrinsics // need to handle it with scalar code - int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize]; + int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize_max]; for(int k = 0; k < 8; k++) { v_store_aligned(vvpl + k*vsize/8, vpl[k]); } - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize_max], vvp[vsize_max], + vx[vsize_max], vy[vsize_max], vzm[vsize_max]; for(int k = 0; k < 4; k++) { v_store_aligned(vup + k*vsize/4, up[k]); @@ -3724,7 +3727,7 @@ struct Luv2RGBinteger // z = zm/256 + zm/65536; for (int k = 0; k < 4; k++) { - z[k] = (zm[k] >> 8) + (zm[k] >> 16); + z[k] = v_add(v_shr<8>(zm[k]), v_shr<16>(zm[k])); } // (x, z) = clip((x, z), min=0, max=2*BASE) @@ -3751,7 +3754,7 @@ struct Luv2RGBinteger { ushort* tab = sRGBInvGammaTab_b; bool srgb = issRGB; - static const int vsize = v_uint8::nlanes; + static const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16(descaleShift); v_int16 vc[9]; @@ -3771,12 +3774,12 @@ struct Luv2RGBinteger // fixing 16bit signed multiplication // by subtracting 2^(base_shift-1) and then adding result back v_int32 dummy32, fm[3]; - v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32); - v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32); - v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32); - fm[0] = fm[0] << (base_shift-1); - fm[1] = fm[1] << (base_shift-1); - fm[2] = fm[2] << (base_shift-1); + v_expand(v_add(vc[0],vc[1],vc[2]), fm[0], dummy32); + v_expand(v_add(vc[3],vc[4],vc[5]), fm[1], dummy32); + v_expand(v_add(vc[6],vc[7],vc[8]), fm[2], dummy32); + fm[0] = v_shl(fm[0], (base_shift-1)); + fm[1] = v_shl(fm[1], (base_shift-1)); + fm[2] = v_shl(fm[2], (base_shift-1)); for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize) { @@ -3816,15 +3819,15 @@ struct Luv2RGBinteger // a bit faster than one loop for all for(int k = 0; k < 4; k++) { - i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift; + i_rgb[k+4*0] = v_shr(v_add(v_add(v_dotprod(xy[k], crxy), v_dotprod(zd[k], crz1)), fm[0])); } for(int k = 0; k < 4; k++) { - i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift; + i_rgb[k+4*1] = v_shr(v_add(v_add(v_dotprod(xy[k], cgxy), v_dotprod(zd[k], cgz1)), fm[1])); } for(int k = 0; k < 4; k++) { - i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift; + i_rgb[k+4*2] = v_shr(v_add(v_add(v_dotprod(xy[k], cbxy), v_dotprod(zd[k], cbz1)), fm[2])); } // [rrggbb] @@ -3842,7 +3845,7 @@ struct Luv2RGBinteger if(srgb) { // [rr.., gg.., bb..] 
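// The buffer-size changes in these hunks follow one rule: VTraits<T>::vlanes() is the
// run-time lane count used for loop strides and indexing, while stack arrays must be
// sized with the compile-time upper bound VTraits<T>::max_nlanes, since vlanes() is not
// a constant expression on scalable backends. A minimal sketch of that idiom, assuming
// CV_SIMD, v_int32, and (as elsewhere in this file) intrin.hpp included inside namespace cv;
// the helper name is illustrative only.
#if CV_SIMD
static inline int sum_lane_indices()
{
    const int vl = VTraits<v_int32>::vlanes();                            // run-time stride
    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_int32>::max_nlanes]; // compile-time size
    for (int k = 0; k < vl; k++)
        buf[k] = k;
    return v_reduce_sum(vx_load_aligned(buf));
}
#endif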
- int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*VTraits::max_nlanes]; for(int k = 0; k < 12; k++) { v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]); @@ -3857,7 +3860,7 @@ struct Luv2RGBinteger // rgb = (rgb*255) >> inv_gamma_shift for(int k = 0; k < 12; k++) { - i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift; + i_rgb[k] = v_shr((v_sub((v_shl(i_rgb[k], 8)), i_rgb[k])), inv_gamma_shift); } for(int k = 0; k < 6; k++) @@ -3940,13 +3943,13 @@ struct Luv2RGB_b static const softfloat fv = vRange/f255; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 vl = vx_setall_f32((float)fl); v_float32 vu = vx_setall_f32((float)fu); v_float32 vv = vx_setall_f32((float)fv); v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, vl, vu, vv); v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow); v_float32 mluv[3], aluv[3]; @@ -3964,7 +3967,7 @@ struct Luv2RGB_b j = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); for( ; j <= (dn - vsize)*3; j += 3*vsize ) { v_uint8 s0, s1, s2; @@ -4017,7 +4020,7 @@ struct Luv2RGB_b v_int32 vi[4*3]; for(int k = 0; k < 4*3; k++) { - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_uint8 rgb[3]; @@ -4039,7 +4042,7 @@ struct Luv2RGB_b for(int k = 0; k < 4; k++) { vf[k] = vx_load_aligned(buf + j + k*fsize); - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3]))); } diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 67e2febd5b..ca39d8a908 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -882,7 +882,7 @@ struct RGBA2mRGBA int i = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); v_uint16 vh = vx_setall_u16(half_val+1); @@ -901,27 +901,27 @@ struct RGBA2mRGBA v_uint16 a16[4]; for(int j = 0; j < 4; j++) - a16[j] = v_reinterpret_as_u16(v[j] & amask); + a16[j] = v_reinterpret_as_u16(v_and(v[j], amask)); v_uint32 a32[4]; for(int j = 0; j < 4; j++) - a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); + a32[j] = v_reinterpret_as_u32(v_or(a16[j], (v_shr(a16[j], 8)))); v_uint8 a[4]; for(int j = 0; j < 4; j++) - a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); + a[j] = v_reinterpret_as_u8(v_or(a32[j], (v_shr(a32[j], 16)))); v_uint16 m[8]; for(int j = 0; j < 4; j++) v_mul_expand(v[j], a[j], m[j], m[j+4]); for(int j = 0; j < 8; j++) - m[j] += vh; + m[j] = v_add(m[j], vh); // div 255: (v+1+(v>>8))>8 // +1 is in vh, has no effect on (v>>8) for(int j = 0; j < 8; j++) - m[j] = (m[j] + (m[j] >> 8)) >> 8; + m[j] = v_shr((v_add(m[j], (v_shr(m[j], 8)))), 8); v_uint8 d[4]; for(int j = 0; j < 4; j++) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 627c052aea..148df552e4 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -188,21 +188,21 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = 
v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7); - v_uint16x8 b0 = v_rotate_right<1>(b1) + b1; - b1 = v_rotate_right<1>(b1) << 1; + v_uint16x8 b1 = v_add(v_shr<7>(v_shl<8>(r0)), v_shr<7>(v_shl<8>(r2))); + v_uint16x8 b0 = v_add(v_rotate_right<1>(b1), b1); + b1 = v_shl<1>(v_rotate_right<1>(b1)); - v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7); - v_uint16x8 g1 = (r1 << 8) >> 7; - g0 += v_rotate_right<1>(g1) + g1; - g1 = v_rotate_right<1>(g1) << 2; + v_uint16x8 g0 = v_add(v_shr<7>(r0), v_shr<7>(r2)); + v_uint16x8 g1 = v_shr<7>(v_shl<8>(r1)); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); + g1 = v_shl<2>(v_rotate_right<1>(g1)); - r0 = r1 >> 8; - r1 = (v_rotate_right<1>(r0) + r0) << 2; - r0 = r0 << 3; + r0 = v_shr<8>(r1); + r1 = v_shl<2>(v_add(v_rotate_right<1>(r0), r0)); + r0 = v_shl<3>(r0); - g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2; - g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2; + g0 = v_shr<2>(v_add(v_add(v_mul_hi(b0, _b2y), v_mul_hi(g0, _g2y)), v_mul_hi(r0, _r2y))); + g1 = v_shr<2>(v_add(v_add(v_mul_hi(b1, _b2y), v_mul_hi(g1, _g2y)), v_mul_hi(r1, _r2y))); v_uint8x16 pack_lo, pack_hi; v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)), v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)), @@ -269,31 +269,31 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); - v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); - v_uint16x8 g1 = r1 & masklo; - g0 += v_rotate_right<1>(g1) + g1; + v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2)); + v_uint16x8 g1 = v_and(r1, masklo); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); g1 = v_rotate_right<1>(g1); - g0 = (g0 + delta2) >> 2; + g0 = v_shr<2>(v_add(g0, delta2)); // g0 g2 ... g14 g1 g3 ... g15 g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... v_uint8x16 pack_lo, pack_hi; @@ -402,31 +402,31 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... 
b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); - v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); - v_uint16x8 g1 = r1 & masklo; - g0 += v_rotate_right<1>(g1) + g1; + v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2)); + v_uint16x8 g1 = v_and(r1, masklo); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); g1 = v_rotate_right<1>(g1); - g0 = (g0 + delta2) >> 2; + g0 = v_shr<2>(v_add(g0, delta2)); // g0 g2 ... g14 g1 g3 ... g15 g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... v_uint8x16 pack_lo, pack_hi; @@ -498,40 +498,40 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow); + v_uint16x8 b1 = v_add(v_and(r0, masklow), v_and(r2, masklow)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); // vertical sum - v_uint16x8 r0g = r0 >> 8; - v_uint16x8 r2g = r2 >> 8; - v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1; + v_uint16x8 r0g = v_shr<8>(r0); + v_uint16x8 r2g = v_shr<8>(r2); + v_uint16x8 sumv = v_shr<1>(v_add(v_add(r0g, r2g), delta1)); // horizontal sum - v_uint16x8 g1 = r1 & masklow; + v_uint16x8 g1 = v_and(r1, masklow); v_uint16x8 nextg1 = v_rotate_right<1>(g1); - v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1; + v_uint16x8 sumg = v_shr<1>(v_add(v_add(g1, nextg1), delta1)); // gradients - v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g); - v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1); - v_uint16x8 gmask = gradg > gradv; - v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full)); + v_uint16x8 gradv = v_add(v_sub(r0g, r2g), v_sub(r2g, r0g)); + v_uint16x8 gradg = v_add(v_sub(nextg1, g1), v_sub(g1, nextg1)); + v_uint16x8 gmask = v_gt(gradg, gradv); + v_uint16x8 g0 = v_add(v_and(gmask, sumv), v_and(sumg, v_xor(gmask, full))); // g0 g2 ... g14 g1 g3 ... g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... 
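// The v_xor/v_and triple in the hunk above implements a lane-masked swap of the packed
// R and B values: where `mask` is all-ones the two vectors exchange lanes, elsewhere they
// are left untouched. The same idiom in isolation (v_uint16x8 and the CV_SIMD128 guard
// are assumed, matching the surrounding code; the helper name is illustrative):
#if CV_SIMD128
static inline void swap_masked(v_uint16x8& a, v_uint16x8& b, const v_uint16x8& mask)
{
    v_uint16x8 t = v_and(v_xor(a, b), mask);  // differing bits, kept only in selected lanes
    a = v_xor(a, t);                          // a takes b's bits where mask is set
    b = v_xor(b, t);                          // b takes a's bits where mask is set
}
#endif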
v_uint8x16 pack_lo, pack_hi; @@ -1060,19 +1060,19 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 b0, b1, b2, b3, b4, b5, b6; - b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9); - b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9); - b2 = v_absdiff(s3, s7)<<1; - b3 = v_absdiff(s1, s9)<<1; + b0 = v_add(v_add(v_shl<1>(v_absdiff(s2, s8)), v_absdiff(s1, s7)), v_absdiff(s3, s9)); + b1 = v_add(v_add(v_shl<1>(v_absdiff(s4, s6)), v_absdiff(s1, s3)), v_absdiff(s7, s9)); + b2 = v_shl<1>(v_absdiff(s3, s7)); + b3 = v_shl<1>(v_absdiff(s1, s9)); v_store(brow, b0); v_store(brow + N, b1); v_store(brow + N2, b2); v_store(brow + N3, b3); - b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8); - b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8); - b6 = (s2 + s4 + s6 + s8)>>1; + b4 = v_add(v_add(b2, v_absdiff(s2, s4)), v_absdiff(s6, s8)); + b5 = v_add(v_add(b3, v_absdiff(s2, s6)), v_absdiff(s4, s8)); + b6 = v_shr<1>(v_add(v_add(v_add(s2, s4), s6), s8)); v_store(brow + N4, b4); v_store(brow + N5, b5); @@ -1279,7 +1279,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16(); v_float32x4 _0_5 = v_setall_f32(0.5f); - #define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA) + #define v_merge_u16(a, b) (v_or((v_and((a), v_reinterpret_as_u16(emask))), (v_and((b), v_reinterpret_as_u16(omask))))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA) #define v_cvt_s16f32_lo(a) v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f) #define v_cvt_s16f32_hi(a) v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f) @@ -1287,16 +1287,16 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 ) { //int gradN = brow0[0] + brow1[0]; - v_uint16x8 gradN = v_load(brow0) + v_load(brow1); + v_uint16x8 gradN = v_add(v_load(brow0), v_load(brow1)); //int gradS = brow1[0] + brow2[0]; - v_uint16x8 gradS = v_load(brow1) + v_load(brow2); + v_uint16x8 gradS = v_add(v_load(brow1), v_load(brow2)); //int gradW = brow1[N-1] + brow1[N]; - v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N); + v_uint16x8 gradW = v_add(v_load(brow1 + N - 1), v_load(brow1 + N)); //int gradE = brow1[N+1] + brow1[N]; - v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N); + v_uint16x8 gradE = v_add(v_load(brow1 + N + 1), v_load(brow1 + N)); //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE); //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE); @@ -1307,14 +1307,14 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) //int gradNE = brow0[N4+1] + brow1[N4]; //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1]; - grad0 = v_load(brow0+N4+1) + v_load(brow1+N4); - grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1); + grad0 = v_add(v_load(brow0 + N4 + 1), v_load(brow1 + N4)); + grad1 = v_add(v_add(v_add(v_load(brow0 + N2), v_load(brow0 + N2 + 1)), v_load(brow1 + N2)), v_load(brow1 + N2 + 1)); v_uint16x8 gradNE = v_merge_u16(grad0, grad1); //int gradSW = brow1[N4] + brow2[N4-1]; //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1]; - grad0 = v_load(brow2+N4-1) + v_load(brow1+N4); - grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + 
v_load(brow1+N2-1); + grad0 = v_add(v_load(brow2 + N4 - 1), v_load(brow1 + N4)); + grad1 = v_add(v_add(v_add(v_load(brow2 + N2), v_load(brow2 + N2 - 1)), v_load(brow1 + N2)), v_load(brow1 + N2 - 1)); v_uint16x8 gradSW = v_merge_u16(grad0, grad1); minGrad = v_min(v_min(minGrad, gradNE), gradSW); @@ -1322,21 +1322,21 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) //int gradNW = brow0[N5-1] + brow1[N5]; //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1]; - grad0 = v_load(brow0+N5-1) + v_load(brow1+N5); - grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1); + grad0 = v_add(v_load(brow0 + N5 - 1), v_load(brow1 + N5)); + grad1 = v_add(v_add(v_add(v_load(brow0 + N3), v_load(brow0 + N3 - 1)), v_load(brow1 + N3)), v_load(brow1 + N3 - 1)); v_uint16x8 gradNW = v_merge_u16(grad0, grad1); //int gradSE = brow1[N5] + brow2[N5+1]; //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1]; - grad0 = v_load(brow2+N5+1) + v_load(brow1+N5); - grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1); + grad0 = v_add(v_load(brow2 + N5 + 1), v_load(brow1 + N5)); + grad1 = v_add(v_add(v_add(v_load(brow2 + N3), v_load(brow2 + N3 + 1)), v_load(brow1 + N3)), v_load(brow1 + N3 + 1)); v_uint16x8 gradSE = v_merge_u16(grad0, grad1); minGrad = v_min(v_min(minGrad, gradNW), gradSE); maxGrad = v_max(v_max(maxGrad, gradNW), gradSE); //int T = minGrad + maxGrad/2; - v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad; + v_uint16x8 T = v_add(v_max((v_shr<1>(maxGrad)), one), minGrad); v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z; @@ -1361,133 +1361,135 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 t0, t1, mask; // gradN *********************************************** - mask = (T > gradN); // mask = T>gradN - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradN) + mask = (v_gt(T, gradN)); // mask = T>gradN + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradN) - t0 = (x3 << 1); // srow[-bstep]*2 - t1 = v_load_expand(srow - bstep*2) + x0; // srow[-bstep*2] + srow[0] + t0 = (v_shl<1>(x3)); // srow[-bstep]*2 + t1 = v_add(v_load_expand(srow - bstep * 2), x0); // srow[-bstep*2] + srow[0] // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN) - GRs += (v_merge_u16(t0, x2 + x4) & mask); + GRs = v_add(GRs, (v_and(v_merge_u16(t0, v_add(x2, x4)), mask))); // Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN) - Bs += (v_merge_u16(x1 + x5, t0) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x5), t0), mask)); // gradNE ********************************************** - mask = (T > gradNE); // mask = T>gradNE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNE) + mask = (v_gt(T, gradNE)); // mask = T>gradNE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNE) - t0 = (x5 << 1); // srow[-bstep+1]*2 - t1 = v_load_expand(srow - bstep*2+2) + x0; // srow[-bstep*2+2] + srow[0] + t0 = (v_shl<1>(x5)); // srow[-bstep+1]*2 + t1 = v_add(v_load_expand(srow - bstep * 2 + 2), x0); // srow[-bstep*2+2] + srow[0] // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, 
v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE) - GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6+1), v_add(x4, x7)), mask)); // Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE) - Bs += (v_merge_u16(t0, x3 + x6) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x3, x6)), mask)); // gradE *********************************************** - mask = (T > gradE); // mask = T>gradE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradE) + mask = (v_gt(T, gradE)); // mask = T>gradE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradE) - t0 = (x7 << 1); // srow[1]*2 - t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0] + t0 = (v_shl<1>(x7)); // srow[1]*2 + t1 = v_add(v_load_expand(srow + 2), x0); // srow[2] + srow[0] // RGs += (srow[2] + srow[0]) * (T>gradE) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += (srow[1]*2) * (T>gradE) - GRs += (t0 & mask); + GRs = v_add(GRs, v_and(t0, mask)); // Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE) - Bs += (v_merge_u16(x5 + x9, x6 + x8) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x5, x9), v_add(x6, x8)), mask)); // gradSE ********************************************** - mask = (T > gradSE); // mask = T>gradSE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSE) + mask = (v_gt(T, gradSE)); // mask = T>gradSE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSE) - t0 = (x9 << 1); // srow[bstep+1]*2 - t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0] + t0 = (v_shl<1>(x9)); // srow[bstep+1]*2 + t1 = v_add(v_load_expand(srow + bstep * 2 + 2), x0); // srow[bstep*2+2] + srow[0] // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE) - GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6+1), v_add(x7, x10)), mask)); // Bs += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE) - Bs += (v_merge_u16((x9 << 1), x8 + x11) & mask); + Bs = v_add(Bs, v_and(v_merge_u16((v_shl<1>(x9)), v_add(x8, x11)), mask)); // gradS *********************************************** - mask = (T > gradS); // mask = T>gradS - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradS) + mask = (v_gt(T, gradS)); // mask = T>gradS + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradS) - t0 = (x11 << 1); // srow[bstep]*2 - t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0] + t0 = (v_shl<1>(x11)); // srow[bstep]*2 + t1 = v_add(v_load_expand(srow + bstep * 2), x0); // srow[bstep*2]+srow[0] // RGs += (srow[bstep*2]+srow[0]) * (T>gradS) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS) - GRs += (v_merge_u16(t0, x10 + x12) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(t0, v_add(x10, x12)), mask)); // Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS) - Bs += (v_merge_u16(x9 + x13, t0) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x9, 
x13), t0), mask)); // gradSW ********************************************** - mask = (T > gradSW); // mask = T>gradSW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSW) + mask = (v_gt(T, gradSW)); // mask = T>gradSW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSW) - t0 = (x13 << 1); // srow[bstep-1]*2 - t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0] + t0 = (v_shl<1>(x13)); // srow[bstep-1]*2 + t1 = v_add(v_load_expand(srow + bstep * 2 - 2), x0); // srow[bstep*2-2]+srow[0] // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW) - GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6-1), v_add(x12, x15)), mask)); // Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW) - Bs += (v_merge_u16(t0, x11 + x14) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x11, x14)), mask)); // gradW *********************************************** - mask = (T > gradW); // mask = T>gradW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradW) + mask = (v_gt(T, gradW)); // mask = T>gradW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradW) - t0 = (x15 << 1); // srow[-1]*2 - t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0] + t0 = (v_shl<1>(x15)); // srow[-1]*2 + t1 = v_add(v_load_expand(srow - 2), x0); // srow[-2]+srow[0] // RGs += (srow[-2]+srow[0]) * (T>gradW) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += (srow[-1]*2) * (T>gradW) - GRs += (t0 & mask); + GRs = v_add(GRs, v_and(t0, mask)); // Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW) - Bs += (v_merge_u16(x1 + x13, x14 + x16) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x13), v_add(x14, x16)), mask)); // gradNW ********************************************** - mask = (T > gradNW); // mask = T>gradNW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNW) + mask = (v_gt(T, gradNW)); // mask = T>gradNW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNW) - t0 = (x1 << 1); // srow[-bstep-1]*2 - t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0] + t0 = (v_shl<1>(x1)); // srow[-bstep-1]*2 + t1 = v_add(v_load_expand(srow - bstep * 2 - 2), x0); // srow[-bstep*2-2]+srow[0] // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW) - GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6-1), v_add(x2, x15)), mask)); // Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW) - Bs += (v_merge_u16((x1 << 1), x3 + x16) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_shl<1>(x1), v_add(x3, x16)), mask)); - v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng); - v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng); + v_float32x4 ngf0 = v_div(_0_5, v_cvt_s16f32_lo(ng)); + v_float32x4 ngf1 = v_div(_0_5, v_cvt_s16f32_hi(ng)); // now interpolate r, g & b - t0 = 
v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs)); - t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) - v_reinterpret_as_s16(RGs)); + t0 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(GRs), v_reinterpret_as_s16(RGs))); + t1 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(Bs), v_reinterpret_as_s16(RGs))); - t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) + + t0 = v_reinterpret_as_u16( + v_add(v_reinterpret_as_s16(x0), v_pack( - v_round(v_cvt_s16f32_lo(t0) * ngf0), - v_round(v_cvt_s16f32_hi(t0) * ngf1))); + v_round(v_mul(v_cvt_s16f32_lo(t0), ngf0)), + v_round(v_mul(v_cvt_s16f32_hi(t0), ngf1))))); - t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) + + t1 = v_reinterpret_as_u16( + v_add(v_reinterpret_as_s16(x0), v_pack( - v_round(v_cvt_s16f32_lo(t1) * ngf0), - v_round(v_cvt_s16f32_hi(t1) * ngf1))); + v_round(v_mul(v_cvt_s16f32_lo(t1), ngf0)), + v_round(v_mul(v_cvt_s16f32_hi(t1), ngf1))))); x1 = v_merge_u16(x0, t0); x2 = v_merge_u16(t0, x0); diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 06053e63fe..9306c78a30 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -1084,9 +1084,9 @@ struct SymmColumnVec_32s8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta)); @@ -1140,9 +1140,9 @@ struct SymmColumnVec_32s8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta)); @@ -1321,23 +1321,23 @@ struct SymmColumnSmallVec_32s16s { v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); v_int32 d4 = vx_setall_s32(d); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), - v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); + v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits::vlanes()), k0, d4)))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_muladd(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + 2*VTraits::vlanes()), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + 3*VTraits::vlanes()), k0, d4)))); } - if( i <= width - 
v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits::vlanes()), k0, d4)))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4))); + i += VTraits::vlanes(); } } #endif @@ -2237,9 +2237,9 @@ struct FilterVec_8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta)); @@ -2248,7 +2248,7 @@ struct FilterVec_8u v_int32 s32 = v_round(s0); v_int16 s16 = v_pack(s32, s32); *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); - i += 4 /*v_int32x4::nlanes*/ ; + i += 4 /*VTraits::vlanes()*/ ; } return i; } diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index cbd60550e0..7a52d0f3fe 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2093,7 +2093,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_float32 v_s11 = vx_setzero_f32(); v_float32 v_s12 = vx_setzero_f32(); v_float32 v_s22 = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); @@ -2134,10 +2134,10 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) result += v_reduce_sum(v_result); #elif CV_SIMD v_float32 v_result = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); - v_result += v_src; + v_result = v_add(v_result, v_src); } result += v_reduce_sum(v_result); #endif @@ -2174,7 +2174,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_float32 v_s1 = vx_setzero_f32(); v_float32 v_s2 = vx_setzero_f32(); v_float32 v_result = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index bbeb8223f1..fc55b0f642 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -455,7 +455,7 @@ struct RemapVec_8u v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2); v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16))); int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; - const uchar* 
src_limit_8bytes = _src.datalimit - v_int16x8::nlanes; + const uchar* src_limit_8bytes = _src.datalimit - VTraits::vlanes(); #define CV_PICK_AND_PACK_RGB(ptr, offset, result) \ { \ const uchar* const p = ((const uchar*)ptr) + (offset); \ @@ -483,7 +483,7 @@ struct RemapVec_8u v_uint8x16 rrggbbaa, dummy; \ v_uint16x8 rrggbbaa8, dummy8; \ v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ - v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ + v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits::vlanes()), 0, 0, 0)); \ v_zip(rgba0, rgba1, rrggbbaa, dummy); \ v_expand(rrggbbaa, rrggbbaa8, dummy8); \ result = v_reinterpret_as_s16(rrggbbaa8); \ @@ -534,8 +534,8 @@ struct RemapVec_8u v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta); v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3); - v0 = v0 >> INTER_REMAP_COEF_BITS; - v2 = v2 >> INTER_REMAP_COEF_BITS; + v0 = v_shr(v0); + v2 = v_shr(v2); v_pack_u_store(D + x, v_pack(v0, v2)); } } @@ -563,8 +563,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1); - v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + v_int32x4 result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result0 = v_rotate_left<1>(result0); v_int16x8 result8 = v_pack(result0, result1); @@ -581,8 +581,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1); - result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result0 = v_rotate_left<1>(result0); result8 = v_pack(result0, result1); @@ -613,8 +613,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1); - v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + v_int32x4 result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); v_int16x8 result8 = v_pack(result0, result1); v_pack_u_store(D, result8); @@ -627,8 +627,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1); - result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result8 = v_pack(result0, result1); v_pack_u_store(D + 8, result8); } @@ -1164,7 +1164,7 @@ public: #if CV_SIMD128 { - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { v_int32x4 ix0 = v_round(v_load(sX + x1)); @@ -1206,9 +1206,9 @@ public: #if CV_SIMD128 { v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - 
span; x1 += span ) - v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale); + v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale)); } #endif for( ; x1 < bcols; x1++ ) @@ -1224,16 +1224,16 @@ public: { v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1)); - v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1)); - v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span)); - v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span)); - v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2)); - v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2)); - v_uint16x8 v_v = v_shl(v_sy8) | (v_sx8); + v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1))); + v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1))); + v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span))); + v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span))); + v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2))); + v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2))); + v_uint16x8 v_v = v_or(v_shl(v_sy8), v_sx8); v_store(A + x1, v_v); v_int16x8 v_d0 = v_pack(v_shr(v_sx0), v_shr(v_sx1)); @@ -1261,18 +1261,18 @@ public: { v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { v_float32x4 v_fx, v_fy; v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy); - v_int32x4 v_sx0 = v_round(v_fx * v_scale); - v_int32x4 v_sy0 = v_round(v_fy * v_scale); + v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale)); + v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale)); v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy); - v_int32x4 v_sx1 = v_round(v_fx * v_scale); - v_int32x4 v_sy1 = v_round(v_fy * v_scale); - v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2)); - v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2)); + v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale)); + v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale)); + v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2))); + v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2))); v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1)); v_store(A + x1, v_v8); v_int16x8 v_dx = v_pack(v_shr(v_sx0), v_shr(v_sx1)); @@ -1941,7 +1941,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { #if CV_SIMD128 { - int span = v_int16x8::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span; x += span ) { v_int16x8 v_dst[2]; @@ -1973,21 +1973,21 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span * 2; x += span * 2 ) { - v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x))); - v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span))); - v_int32x4 v_iy0 = v_round(v_scale * 
(v_load(src2f + x))); - v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span))); + v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x))); + v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span))); + v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x))); + v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span))); v_int16x8 v_dst[2]; v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); v_dst[1] = v_pack(v_shr(v_iy0), v_shr(v_iy1)); v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); - v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)); - v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)); + v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))); + v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask))); v_store(dst2 + x, v_pack_u(v_dst0, v_dst1)); } } @@ -2008,7 +2008,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, if( nninterpolate ) { #if CV_SIMD128 - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); { for( ; x <= (size.width << 1) - span * 2; x += span * 2 ) v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)), @@ -2034,16 +2034,16 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for (; x <= size.width - span; x += span ) { v_float32x4 v_src0[2], v_src1[2]; v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]); v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]); - v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale); - v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale); - v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale); - v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale); + v_int32x4 v_ix0 = v_round(v_mul(v_src0[0], v_scale)); + v_int32x4 v_ix1 = v_round(v_mul(v_src1[0], v_scale)); + v_int32x4 v_iy0 = v_round(v_mul(v_src0[1], v_scale)); + v_int32x4 v_iy1 = v_round(v_mul(v_src1[1], v_scale)); v_int16x8 v_dst[2]; v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); @@ -2051,8 +2051,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); v_store(dst2 + x, v_pack_u( - v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)), - v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)))); + v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))), + v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask))))); } } #endif @@ -2074,13 +2074,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1); v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1); v_float32x4 v_scale = v_setall_f32(scale); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span * 2; x += span * 2 ) { v_uint32x4 v_fxy1, v_fxy2; if ( src2 ) { - v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2; + v_uint16x8 v_src2 = v_and(v_load(src2 + x), v_mask2); v_expand(v_src2, v_fxy1, v_fxy2); } else @@ -2091,9 +2091,9 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); v_expand(v_src[0], v_src0[0], v_src0[1]); v_expand(v_src[1], v_src1[0], v_src1[1]); - #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\ + #define 
CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\ v_cvt_f32(v_reinterpret_as_s32(X))) - #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\ + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr((FXY)))),\ v_cvt_f32(v_reinterpret_as_s32(Y))) v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); @@ -2123,13 +2123,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1); v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1); v_float32x4 v_scale = v_setall_f32(scale); - int span = v_int16x8::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span; x += span ) { v_int32x4 v_fxy1, v_fxy2; if (src2) { - v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2; + v_int16x8 v_src2 = v_and(v_load((short *)src2 + x), v_mask2); v_expand(v_src2, v_fxy1, v_fxy2); } else @@ -2142,8 +2142,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_expand(v_src[0], v_src0[0], v_src0[1]); v_expand(v_src[1], v_src1[0], v_src1[1]); - #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X)) - #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y)) + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X)) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr((FXY))), v_cvt_f32(Y)) v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]); @@ -2234,12 +2234,12 @@ public: #if CV_SIMD128 { v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bw - span; x1 += span ) { v_int16x8 v_dst[2]; - #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(shift+v_load(ptr + offset)),\ - v_shr(shift+v_load(ptr + offset + 4))) + #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(v_add(shift,v_load(ptr + offset))),\ + v_shr(v_add(shift,v_load(ptr + offset + 4)))) v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0); v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0); #undef CV_CONVERT_MAP @@ -2272,21 +2272,21 @@ public: { v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bw - span * 2; x1 += span * 2 ) { - v_int32x4 v_X0 = v_shr(v__X0 + v_load(adelta + x + x1)); - v_int32x4 v_Y0 = v_shr(v__Y0 + v_load(bdelta + x + x1)); - v_int32x4 v_X1 = v_shr(v__X0 + v_load(adelta + x + x1 + span)); - v_int32x4 v_Y1 = v_shr(v__Y0 + v_load(bdelta + x + x1 + span)); + v_int32x4 v_X0 = v_shr(v_add(v__X0, v_load(this->adelta + x + x1))); + v_int32x4 v_Y0 = v_shr(v_add(v__Y0, v_load(this->bdelta + x + x1))); + v_int32x4 v_X1 = v_shr(v_add(v__X0, v_load(this->adelta + x + x1 + span))); + v_int32x4 v_Y1 = v_shr(v_add(v__Y0, v_load(this->bdelta + x + x1 + span))); v_int16x8 v_xy[2]; v_xy[0] = v_pack(v_shr(v_X0), v_shr(v_X1)); v_xy[1] = v_pack(v_shr(v_Y0), v_shr(v_Y1)); v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]); - v_int32x4 v_alpha0 = v_shl(v_Y0 & v_mask) | (v_X0 & v_mask); - v_int32x4 v_alpha1 = v_shl(v_Y1 & v_mask) | (v_X1 & v_mask); + v_int32x4 v_alpha0 = v_or(v_shl(v_and(v_Y0, v_mask)), v_and(v_X0, 
v_mask)); + v_int32x4 v_alpha1 = v_or(v_shl(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask)); v_store(alpha + x1, v_pack(v_alpha0, v_alpha1)); } } @@ -2866,16 +2866,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X0, v_Y0; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X0 = v_round(v_fX0, v_fX1); v_Y0 = v_round(v_fY0, v_fY1); @@ -2885,16 +2885,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X1, v_Y1; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X1 = v_round(v_fX0, v_fX1); v_Y1 = v_round(v_fY0, v_fY1); @@ -2904,16 +2904,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X2, v_Y2; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W 
= v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X2 = v_round(v_fX0, v_fX1); v_Y2 = v_round(v_fY0, v_fY1); @@ -2923,16 +2923,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X3, v_Y3; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X3 = v_round(v_fX0, v_fX1); v_Y3 = v_round(v_fY0, v_fY1); @@ -2987,16 +2987,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X0, v_Y0; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X0 = v_round(v_fX0, v_fX1); v_Y0 = v_round(v_fY0, v_fY1); @@ -3006,16 +3006,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X1, v_Y1; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, 
v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X1 = v_round(v_fX0, v_fX1); v_Y1 = v_round(v_fY0, v_fY1); @@ -3025,16 +3025,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X2, v_Y2; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X2 = v_round(v_fX0, v_fX1); v_Y2 = v_round(v_fY0, v_fY1); @@ -3044,35 +3044,35 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X3, v_Y3; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), 
v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
             v_X3 = v_round(v_fX0, v_fX1);
             v_Y3 = v_round(v_fY0, v_fY1);
         }
         // store alpha
-        v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
-        v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
+        v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
+        v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
         v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));
-        v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
-        v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
+        v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
+        v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
         v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));
         // convert to 16s
-        v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
-        v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
-        v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
-        v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
+        v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
+        v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
+        v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
+        v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));
         v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
         v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp
index 7d8423d322..1fe2e4060c 100644
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@@ -179,10 +179,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
             for (k = 0; k < 16; ++k)
             {
 #if CV_SIMD256
-                v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
+                v_store(H.fine[k], v_add(v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)), v256_load(H.fine[k])));
 #elif CV_SIMD128
-                v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
-                v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
+                v_store(H.fine[k], v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k)), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k])));
+                v_store(H.fine[k] + 8, v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k] + 8)));
 #else
                 for (int ind = 0; ind < 16; ++ind)
                     H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
@@ -199,10 +199,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 for( j = 0; j < 2*r; ++j, px += 16 )
                 {
 #if CV_SIMD256
-                    v_coarse += v256_load(px);
+                    v_coarse = v_add(v_coarse, v256_load(px));
 #elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
+                    v_coarsel = v_add(v_coarsel, v_load(px));
+                    v_coarseh = v_add(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] += px[ind];
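Every hunk in this file follows the same mechanical rewrite: infix operators on vector registers become the named wrappers (v_add, v_sub, v_and, v_gt, ...), and compound assignments such as a += b become a = v_add(a, b). As a reading aid, here is a minimal sketch in that style; it is illustrative only, assumes a CV_SIMD-enabled build, and its function and buffer names are made up rather than taken from the patch:

    // Illustrative sketch: the operator -> named-intrinsic style this patch migrates to.
    #include <opencv2/core/hal/intrin.hpp>

    static int sumAboveThreshold(const int* data, int len, int thr)
    {
        using namespace cv;
        v_int32 acc  = vx_setzero_s32();
        v_int32 vthr = vx_setall_s32(thr);
        int i = 0;
        for (; i <= len - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
        {
            v_int32 x    = vx_load(data + i);
            v_int32 mask = v_gt(x, vthr);         // was: x > vthr
            acc = v_add(acc, v_and(x, mask));     // was: acc += (x & mask)
        }
        int s = v_reduce_sum(acc);                // horizontal sum of the lanes
        for (; i < len; ++i)                      // scalar tail
            if (data[i] > thr)
                s += data[i];
        return s;
    }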
@@ -216,11 +216,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
 #if CV_SIMD256
-                v_coarse += v256_load(px);
+                v_coarse = v_add(v_coarse, v256_load(px));
                 v_store(H.coarse, v_coarse);
 #elif CV_SIMD128
-                v_coarsel += v_load(px);
-                v_coarseh += v_load(px + 8);
+                v_coarsel = v_add(v_coarsel, v_load(px));
+                v_coarseh = v_add(v_coarseh, v_load(px + 8));
                 v_store(H.coarse, v_coarsel);
                 v_store(H.coarse + 8, v_coarseh);
 #else
@@ -261,10 +261,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
                     {
 #if CV_SIMD256
-                        v_fine += v256_load(px);
+                        v_fine = v_add(v_fine, v256_load(px));
 #elif CV_SIMD128
-                        v_finel += v_load(px);
-                        v_fineh += v_load(px + 8);
+                        v_finel = v_add(v_finel, v_load(px));
+                        v_fineh = v_add(v_fineh, v_load(px + 8));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] += px[ind];
@@ -275,10 +275,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     {
                         px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
 #if CV_SIMD256
-                        v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n));
+                        v_fine = v_add(v_fine, v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)));
 #elif CV_SIMD128
-                        v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n)));
-                        v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
+                        v_finel = v_add(v_finel, v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))));
+                        v_fineh = v_add(v_fineh, v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
@@ -298,10 +298,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     for ( ; luc[k] < j+r+1; ++luc[k] )
                     {
 #if CV_SIMD256
-                        v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
+                        v_fine = v_sub(v_add(v_fine, v256_load(px + 16 * MIN(luc[k], n - 1))), v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
 #elif CV_SIMD128
-                        v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
-                        v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
+                        v_finel = v_sub(v_add(v_finel, v_load(px + 16 * MIN(luc[k], n - 1) )), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
+                        v_fineh = v_sub(v_add(v_fineh, v_load(px + 16 * MIN(luc[k], n - 1) + 8)), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
@@ -312,12 +312,12 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     px = h_coarse + 16 * (n*c + MAX(j - r, 0));
 #if CV_SIMD256
                     v_store(H.fine[k], v_fine);
-                    v_coarse -= v256_load(px);
+                    v_coarse = v_sub(v_coarse, v256_load(px));
 #elif CV_SIMD128
                     v_store(H.fine[k], v_finel);
                     v_store(H.fine[k] + 8, v_fineh);
-                    v_coarsel -= v_load(px);
-                    v_coarseh -= v_load(px + 8);
+                    v_coarsel = v_sub(v_coarsel, v_load(px));
+                    v_coarseh = v_sub(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] -= px[ind];
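The files that follow (moments.cpp, pyramids.cpp, resize.cpp, sumpixels.simd.hpp) add two more recurring substitutions: the compile-time lane count T::nlanes becomes the call VTraits<T>::vlanes(), which also works when the vector length is only known at run time, and member-style element access such as .get0() becomes the free function v_get0(). A small illustrative loop, not taken from the patch (the buffer name is hypothetical), assuming a CV_SIMD-enabled build:

    // Illustrative sketch: VTraits<T>::vlanes() loop bound and v_get0() element access.
    #include <opencv2/core/hal/intrin.hpp>

    static int vectorSumLane0(const int* row, int width)
    {
        using namespace cv;
        v_int32 acc = vx_setzero_s32();
        int x = 0;
        // was: for (; x <= width - v_int32::nlanes; x += v_int32::nlanes)
        for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes())
            acc = v_add(acc, vx_load(row + x));
        // was: acc.get0()
        return v_get0(acc);   // lane 0 of the running vector sum
    }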
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 204c8654af..523ea586d4 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -236,12 +236,12 @@ struct MomentsInTile_SIMD
                 v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
                 v_int16x8 sx = v_mul_wrap(qx, qx);
-                qx0 += v_reinterpret_as_u32(p);
+                qx0 = v_add(qx0, v_reinterpret_as_u32(p));
                 qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
                 qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
                 qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));
-                qx += dx;
+                qx = v_add(qx, dx);
             }
             x0 = v_reduce_sum(qx0);
@@ -276,19 +276,19 @@ struct MomentsInTile_SIMD
             {
                 v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));
-                v_x0 += v_reinterpret_as_u32(v_src);
-                v_x1 += v_reinterpret_as_u32(v_src * v_ix0);
+                v_x0 = v_add(v_x0, v_reinterpret_as_u32(v_src));
+                v_x1 = v_add(v_x1, v_reinterpret_as_u32(v_mul(v_src, v_ix0)));
-                v_int32x4 v_ix1 = v_ix0 * v_ix0;
-                v_x2 += v_reinterpret_as_u32(v_src * v_ix1);
+                v_int32x4 v_ix1 = v_mul(v_ix0, v_ix0);
+                v_x2 = v_add(v_x2, v_reinterpret_as_u32(v_mul(v_src, v_ix1)));
-                v_ix1 = v_ix0 * v_ix1;
-                v_src = v_src * v_ix1;
+                v_ix1 = v_mul(v_ix0, v_ix1);
+                v_src = v_mul(v_src, v_ix1);
                 v_uint64x2 v_lo, v_hi;
                 v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
-                v_x3 += v_lo + v_hi;
+                v_x3 = v_add(v_x3, v_add(v_lo, v_hi));
-                v_ix0 += v_delta;
+                v_ix0 = v_add(v_ix0, v_delta);
             }
             x0 = v_reduce_sum(v_x0);
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index dae09564d3..f65ae62158 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -463,7 +463,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for ( ; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32x4 r0, r1, r2, r3, r4, t0;
         r0 = v_load(row0 + x);
         r4 = v_load(row4 + x);
         t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
-        *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+        *((unaligned_int*) (dst + x)) = v_get0(v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())));
     }
 #else
     for (; x <= width - 1; x += 1)
@@ -615,15 +615,15 @@ template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x);
         v_int32 v_2r10 = v_add(v_r10, v_r10);
         v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
-        *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
-        *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
+        *(unaligned_int*)(dst0 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
+        *(unaligned_int*)(dst1 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())));
     }
 #else
     for (; x <= width - 1; x += 1)
@@ -754,14 +754,14 @@ template <> int PyrUpVecVOneRow<int, uchar>(int** src, uchar* dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x);
         v_int32 v_2r10 = v_add(v_r10, v_r10);
         v_int16 d =
v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); - *(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); + *(unaligned_int*)(dst + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16()))); } #else for (; x <= width - 1; x += 1) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 1ad8e8932d..4668f0bdf3 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2473,7 +2473,7 @@ public: v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_uint16 bl, gl, rl; #if CV_SIMD_WIDTH == 16 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5); @@ -2493,7 +2493,7 @@ public: v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_uint16 bh, gh, rh; #if CV_SIMD_WIDTH == 16 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5); @@ -2566,7 +2566,7 @@ public: v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); } #else - v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); #endif #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) @@ -2609,7 +2609,7 @@ public: } #elif CV_SIMD_WIDTH >= 64 v_uint32 masklow = vx_setall_u32(0x0000ffff); - for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint16 b0, g0, r0, b1, g1, r1; v_load_deinterleave(S0, b0, g0, r0); @@ -2617,8 +2617,8 @@ public: v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); - v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0); - v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1); + v_load_deinterleave(S0 + 3*VTraits::vlanes(), b0, g0, r0); + v_load_deinterleave(S1 + 3*VTraits::vlanes(), b1, g1, r1); v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + 
(v_reinterpret_as_u32(r1) & masklow); @@ -2630,7 +2630,7 @@ public: { CV_Assert(cn == 4); #if CV_SIMD_WIDTH >= 64 - for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_uint64 r00, r01, r10, r11; v_load_deinterleave((uint64_t*)S0, r00, r01); @@ -2652,7 +2652,7 @@ public: r0 = v_add(r0, r2); r1 = v_add(r1, r3); v_uint32 v_d; #if CV_SIMD_WIDTH == 16 - v_d = r0 + r1; + v_d = v_add(r0, r1); #elif CV_SIMD_WIDTH == 32 v_uint32 t0, t1; v_recombine(r0, r1, t0, t1); @@ -2697,7 +2697,7 @@ public: { #if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) - v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { @@ -2738,7 +2738,7 @@ public: v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } #elif CV_SIMD_WIDTH >= 64 - for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_int16 b0, g0, r0, b1, g1, r1; v_load_deinterleave(S0, b0, g0, r0); @@ -2746,8 +2746,8 @@ public: v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); - v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0); - v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1); + v_load_deinterleave(S0 + 3*VTraits::vlanes(), b0, g0, r0); + v_load_deinterleave(S1 + 3*VTraits::vlanes(), b1, g1, r1); v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); @@ -2779,7 +2779,7 @@ public: r3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); v_int32 dl, dh; #if CV_SIMD_WIDTH == 16 - dl = r0 + r1; dh = r2 + r3; + dl = v_add(r0, r1); dh = v_add(r2, r3); #elif CV_SIMD_WIDTH == 32 v_int32 t0, t1, t2, t3; v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); @@ -2829,14 +2829,14 @@ struct ResizeAreaFastVec_SIMD_32f { #if CV_SIMD_WIDTH == 16 v_float32 v_025 = vx_setall_f32(0.25f); - for (; dx <= w - v_float32::nlanes; dx += 
v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) - v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025); + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) + v_store(D, v_mul(v_add(v_add(vx_load(S0), vx_load(S0 + VTraits::vlanes())), v_add(vx_load(S1), vx_load(S1 + VTraits::vlanes()))), v_025)); #elif CV_SIMD256 v_float32x8 v_025 = v256_setall_f32(0.25f); - for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_float32x8 dst0, dst1; - v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1); + v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + VTraits::vlanes()), v256_load(S1 + VTraits::vlanes())), dst0, dst1); v_store(D, v_mul(v_add(dst0, dst1), v_025)); } #endif diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp index f5f3a92d85..208ffc1231 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -114,7 +114,7 @@ struct Integral_SIMD v_int32 prev = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; @@ -127,8 +127,8 @@ struct Integral_SIMD el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask)); prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -136,12 +136,12 @@ struct Integral_SIMD #endif #endif v_expand(el8, el4l, el4h); - el4l += prev; - el4h += el4l; - prev = v_broadcast_element(el4h); + el4l = v_add(el4l, prev); + el4h = v_add(el4h, el4l); + prev = v_broadcast_highest(el4h); #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -162,11 +162,11 @@ struct Integral_SIMD v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for ( ; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_int32 el4l_1, el4h_1, el4l_2, el4h_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, 
_mm256_slli_si256(el8_1.val, 2)); @@ -183,10 +183,10 @@ struct Integral_SIMD prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask); prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -197,20 +197,20 @@ struct Integral_SIMD #endif v_expand(el8_1, el4l_1, el4h_1); v_expand(el8_2, el4l_2, el4h_2); - el4l_1 += prev_1; - el4l_2 += prev_2; - el4h_1 += el4l_1; - el4h_2 += el4l_2; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); + el4l_1 = v_add(el4l_1, prev_1); + el4l_2 = v_add(el4l_2, prev_2); + el4h_1 = v_add(el4h_1, el4l_1); + el4h_2 = v_add(el4h_2, el4l_2); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); #endif v_int32 el4_1, el4_2, el4_3, el4_4; v_zip(el4l_1, el4l_2, el4_1, el4_2); v_zip(el4h_1, el4h_2, el4_3, el4_4); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); - v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); - v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -230,7 +230,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn; int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn; - int row_cache[v_int32::nlanes * 6]; + int row_cache[VTraits::max_nlanes * 6]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -238,10 +238,10 @@ struct Integral_SIMD prev_3 = vx_setzero_s32(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -270,49 +270,49 @@ struct Integral_SIMD prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1,v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2,v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3,v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1,v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2,v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3,v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_expand(el8_1, el4l_1, el4h_1); v_expand(el8_2, el4l_2, el4h_2); v_expand(el8_3, el4l_3, el4h_3); - el4l_1 += prev_1; - el4l_2 += prev_2; - el4l_3 += prev_3; - el4h_1 += el4l_1; - el4h_2 += el4l_2; - el4h_3 += el4l_3; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); - prev_3 = v_broadcast_element(el4h_3); + el4l_1 = v_add(el4l_1, prev_1); + el4l_2 = v_add(el4l_2, prev_2); + el4l_3 = v_add(el4l_3, prev_3); + el4h_1 = v_add(el4h_1, el4l_1); + el4h_2 = v_add(el4h_2, el4l_2); + el4h_3 = v_add(el4h_3, el4l_3); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); + prev_3 = v_broadcast_highest(el4h_3); #endif v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); - v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4h_1, el4h_2, el4h_3); el4l_1 = vx_load(row_cache ); - el4l_2 = vx_load(row_cache + v_int32::nlanes ); - el4l_3 = vx_load(row_cache + v_int32::nlanes * 2); - el4h_1 = vx_load(row_cache + v_int32::nlanes * 3); - el4h_2 = vx_load(row_cache + v_int32::nlanes * 4); - el4h_3 = vx_load(row_cache + v_int32::nlanes * 5); - v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); - v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); - v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); - v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4)); - v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5)); + el4l_2 = vx_load(row_cache + VTraits::vlanes() ); + el4l_3 = vx_load(row_cache + VTraits::vlanes() * 2); + el4h_1 = vx_load(row_cache + VTraits::vlanes() * 3); + el4h_2 = vx_load(row_cache + VTraits::vlanes() * 4); + el4h_3 = vx_load(row_cache + 
VTraits::vlanes() * 5); + v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j ))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits::vlanes() ))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); } for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -339,7 +339,7 @@ struct Integral_SIMD v_int32 prev = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; @@ -356,8 +356,8 @@ struct Integral_SIMD #endif #endif v_expand(el8, el4l, el4h); - el4l += prev; - el4h += el4l; + el4l = v_add(el4l, prev); + el4h = v_add(el4h, el4l); #if CV_SIMD_WIDTH == 16 prev = el4h; #elif CV_SIMD_WIDTH == 32 @@ -368,8 +368,8 @@ struct Integral_SIMD prev = v_combine_low(t, t); #endif #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -426,7 +426,7 @@ struct Integral_SIMD v_float32 prev = vx_setzero_f32(); int j = 0; - for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for (; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; @@ -439,8 +439,8 @@ struct Integral_SIMD el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask)); prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -449,12 +449,12 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4l = v_cvt_f32(el4li) + prev; - el4h = v_cvt_f32(el4hi) + el4l; - prev = v_broadcast_element(el4h); + el4l = v_add(v_cvt_f32(el4li), prev); + el4h = v_add(v_cvt_f32(el4hi), el4l); + prev = v_broadcast_highest(el4h); #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -475,11 +475,11 @@ struct Integral_SIMD v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(); int j = 0; - for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for (; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = 
v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_float32 el4l_1, el4h_1, el4l_2, el4h_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); @@ -496,10 +496,10 @@ struct Integral_SIMD prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask); prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -511,20 +511,20 @@ struct Integral_SIMD v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); - el4l_1 = v_cvt_f32(el4li_1) + prev_1; - el4l_2 = v_cvt_f32(el4li_2) + prev_2; - el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; - el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); + el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1); + el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2); + el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1); + el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); #endif v_float32 el4_1, el4_2, el4_3, el4_4; v_zip(el4l_1, el4l_2, el4_1, el4_2); v_zip(el4h_1, el4h_2, el4_3, el4_4); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); - v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); - v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -543,7 +543,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn; float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn; - float row_cache[v_float32::nlanes * 6]; + float row_cache[VTraits::max_nlanes * 6]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -551,10 +551,10 @@ struct Integral_SIMD prev_3 = vx_setzero_f32(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -583,12 +583,12 @@ struct Integral_SIMD prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -603,30 +603,30 @@ struct Integral_SIMD v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); v_expand(el8_3, el4li_3, el4hi_3); - el4l_1 = v_cvt_f32(el4li_1) + prev_1; - el4l_2 = v_cvt_f32(el4li_2) + prev_2; - el4l_3 = v_cvt_f32(el4li_3) + prev_3; - el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; - el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; - el4h_3 = v_cvt_f32(el4hi_3) + el4l_3; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); - prev_3 = v_broadcast_element(el4h_3); + el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1); + el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2); + el4l_3 = v_add(v_cvt_f32(el4li_3), prev_3); + el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1); + el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2); + el4h_3 = v_add(v_cvt_f32(el4hi_3), el4l_3); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); + prev_3 = v_broadcast_highest(el4h_3); #endif v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); - v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4h_1, el4h_2, el4h_3); el4l_1 = vx_load(row_cache ); - el4l_2 = vx_load(row_cache + v_float32::nlanes ); - el4l_3 = vx_load(row_cache + v_float32::nlanes * 2); - el4h_1 = vx_load(row_cache + v_float32::nlanes * 3); - el4h_2 = vx_load(row_cache + v_float32::nlanes * 4); - el4h_3 = vx_load(row_cache + v_float32::nlanes * 5); - v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); - v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); - v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); - v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4)); - v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5)); + el4l_2 = vx_load(row_cache + VTraits::vlanes() ); + el4l_3 = vx_load(row_cache + VTraits::vlanes() * 2); + el4h_1 = vx_load(row_cache + VTraits::vlanes() * 3); + el4h_2 = vx_load(row_cache + VTraits::vlanes() * 4); + el4h_3 = vx_load(row_cache + VTraits::vlanes() * 5); + v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j 
+ VTraits::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); } for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -652,7 +652,7 @@ struct Integral_SIMD v_float32 prev = vx_setzero_f32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; @@ -670,8 +670,8 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4l = v_cvt_f32(el4li) + prev; - el4h = v_cvt_f32(el4hi) + el4l; + el4l = v_add(v_cvt_f32(el4li), prev); + el4h = v_add(v_cvt_f32(el4hi), el4l); #if CV_SIMD_WIDTH == 16 prev = el4h; #elif CV_SIMD_WIDTH == 32 @@ -682,8 +682,8 @@ struct Integral_SIMD prev = v_combine_low(t, t); #endif #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -750,7 +750,7 @@ struct Integral_SIMD v_float64 prev = vx_setzero_f64(); int j = 0; - for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for (; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float64 el4ll, el4lh, el4hl, el4hh; @@ -767,8 +767,8 @@ struct Integral_SIMD el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d); prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -777,17 +777,17 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4ll = v_cvt_f64(el4li) + prev; - el4lh = v_cvt_f64_high(el4li) + prev; - el4hl = v_cvt_f64(el4hi) + el4ll; - el4hh = v_cvt_f64_high(el4hi) + el4lh; - prev = vx_setall_f64(v_extract_n(el4hh)); -// prev = v_broadcast_element(el4hh); + el4ll = v_add(v_cvt_f64(el4li), prev); + el4lh = v_add(v_cvt_f64_high(el4li), prev); + el4hl = v_add(v_cvt_f64(el4hi), el4ll); + el4hh = v_add(v_cvt_f64_high(el4hi), el4lh); + prev = vx_setall_f64(v_extract_highest(el4hh)); +// prev = v_broadcast_highest(el4hh); #endif - v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + 
VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -808,11 +808,11 @@ struct Integral_SIMD v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); int j = 0; - for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for (; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); @@ -838,10 +838,10 @@ struct Integral_SIMD prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff); prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -853,32 +853,32 @@ struct Integral_SIMD v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); - el4ll_1 = v_cvt_f64(el4li_1) + prev_1; - el4ll_2 = v_cvt_f64(el4li_2) + prev_2; - el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; - el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; - el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; - el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; - el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; - el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; - prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); - prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); -// prev_1 = v_broadcast_element(el4hh_1); -// prev_2 = v_broadcast_element(el4hh_2); + el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1); + el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2); + el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1); + el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2); + el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1); + el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2); + el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1); + el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2); + prev_1 = vx_setall_f64(v_extract_highest(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_highest(el4hh_2)); +// prev_1 = v_broadcast_highest(el4hh_1); +// prev_2 = v_broadcast_highest(el4hh_2); #endif v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8; v_zip(el4ll_1, el4ll_2, el4_1, el4_2); v_zip(el4lh_1, el4lh_2, el4_3, el4_4); v_zip(el4hl_1, el4hl_2, el4_5, el4_6); v_zip(el4hh_1, el4hh_2, el4_7, el4_8); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); - v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + 
v_float64::nlanes * 4)); - v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5)); - v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6)); - v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4_5, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4_6, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); + v_store(sum_row + j + VTraits::vlanes() * 6, v_add(el4_7, vx_load(prev_sum_row + j + VTraits::vlanes() * 6))); + v_store(sum_row + j + VTraits::vlanes() * 7, v_add(el4_8, vx_load(prev_sum_row + j + VTraits::vlanes() * 7))); } for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -897,7 +897,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn; double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn; - double row_cache[v_float64::nlanes * 12]; + double row_cache[VTraits::max_nlanes * 12]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -905,10 +905,10 @@ struct Integral_SIMD prev_3 = vx_setzero_f64(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -951,12 +951,12 @@ struct Integral_SIMD prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -971,53 +971,53 @@ struct Integral_SIMD v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); v_expand(el8_3, el4li_3, el4hi_3); - el4ll_1 = v_cvt_f64(el4li_1) + prev_1; - el4ll_2 = v_cvt_f64(el4li_2) + prev_2; - el4ll_3 = v_cvt_f64(el4li_3) + prev_3; - el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; - el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; - el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3; - el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; - el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; - el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3; - el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; - el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; - el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3; - prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); - prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); - prev_3 = vx_setall_f64(v_extract_n(el4hh_3)); -// prev_1 = v_broadcast_element(el4hh_1); -// prev_2 = v_broadcast_element(el4hh_2); -// prev_3 = v_broadcast_element(el4hh_3); + el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1); + el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2); + el4ll_3 = v_add(v_cvt_f64(el4li_3), prev_3); + el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1); + el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2); + el4lh_3 = v_add(v_cvt_f64_high(el4li_3), prev_3); + el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1); + el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2); + el4hl_3 = v_add(v_cvt_f64(el4hi_3), el4ll_3); + el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1); + el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2); + el4hh_3 = v_add(v_cvt_f64_high(el4hi_3), el4lh_3); + prev_1 = vx_setall_f64(v_extract_highest(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_highest(el4hh_2)); + prev_3 = vx_setall_f64(v_extract_highest(el4hh_3)); +// prev_1 = v_broadcast_highest(el4hh_1); +// prev_2 = v_broadcast_highest(el4hh_2); +// prev_3 = v_broadcast_highest(el4hh_3); #endif v_store_interleave(row_cache , el4ll_1, el4ll_2, el4ll_3); - v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3); - v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3); - v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4lh_1, el4lh_2, el4lh_3); + v_store_interleave(row_cache + VTraits::vlanes() * 6, el4hl_1, el4hl_2, el4hl_3); + v_store_interleave(row_cache + VTraits::vlanes() * 9, el4hh_1, el4hh_2, el4hh_3); el4ll_1 = vx_load(row_cache ); - el4ll_2 = vx_load(row_cache + v_float64::nlanes ); - el4ll_3 
= vx_load(row_cache + v_float64::nlanes * 2 ); - el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 ); - el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 ); - el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 ); - el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 ); - el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 ); - el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 ); - el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 ); - el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10); - el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11); - v_store(sum_row + j , el4ll_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 )); - v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 )); - v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 )); - v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 )); - v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 )); - v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 )); - v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 )); - v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 )); - v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10)); - v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11)); + el4ll_2 = vx_load(row_cache + VTraits::vlanes() ); + el4ll_3 = vx_load(row_cache + VTraits::vlanes() * 2 ); + el4lh_1 = vx_load(row_cache + VTraits::vlanes() * 3 ); + el4lh_2 = vx_load(row_cache + VTraits::vlanes() * 4 ); + el4lh_3 = vx_load(row_cache + VTraits::vlanes() * 5 ); + el4hl_1 = vx_load(row_cache + VTraits::vlanes() * 6 ); + el4hl_2 = vx_load(row_cache + VTraits::vlanes() * 7 ); + el4hl_3 = vx_load(row_cache + VTraits::vlanes() * 8 ); + el4hh_1 = vx_load(row_cache + VTraits::vlanes() * 9 ); + el4hh_2 = vx_load(row_cache + VTraits::vlanes() * 10); + el4hh_3 = vx_load(row_cache + VTraits::vlanes() * 11); + v_store(sum_row + j , v_add(el4ll_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4ll_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2 , v_add(el4ll_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3 , v_add(el4lh_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4 , v_add(el4lh_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5 , v_add(el4lh_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); + v_store(sum_row + j + VTraits::vlanes() * 6 , v_add(el4hl_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 6))); + v_store(sum_row + j + VTraits::vlanes() * 7 , v_add(el4hl_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 7))); + v_store(sum_row + j + VTraits::vlanes() * 8 , v_add(el4hl_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 8))); + v_store(sum_row + j + VTraits::vlanes() * 9 , v_add(el4hh_1, vx_load(prev_sum_row + j + VTraits::vlanes() 
* 9))); + v_store(sum_row + j + VTraits::vlanes() * 10, v_add(el4hh_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 10))); + v_store(sum_row + j + VTraits::vlanes() * 11, v_add(el4hh_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 11))); } for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -1043,7 +1043,7 @@ struct Integral_SIMD v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float64 el4ll, el4lh, el4hl, el4hh; @@ -1065,10 +1065,10 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4ll = v_cvt_f64(el4li) + prev_1; - el4lh = v_cvt_f64_high(el4li) + prev_2; - el4hl = v_cvt_f64(el4hi) + el4ll; - el4hh = v_cvt_f64_high(el4hi) + el4lh; + el4ll = v_add(v_cvt_f64(el4li), prev_1); + el4lh = v_add(v_cvt_f64_high(el4li), prev_2); + el4hl = v_add(v_cvt_f64(el4hi), el4ll); + el4hh = v_add(v_cvt_f64_high(el4hi), el4lh); #if CV_SIMD_WIDTH == 16 prev_1 = el4hl; prev_2 = el4hh; @@ -1078,10 +1078,10 @@ struct Integral_SIMD prev_1 = prev_2 = v_combine_high(el4hh, el4hh); #endif #endif - v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1], diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index b57e92ff9a..b83263304f 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -268,13 +268,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp for ( i = 0; i < 256; i += 4) { v_store(_data + i, v_sqrt(idx)); - idx += ifour; + idx = v_add(idx, ifour); } else for ( i = 0; i < 256; i += 4) { v_store(_data + i, idx); - idx += ifour; + idx = v_add(idx, ifour); } #else if( gammaCorrection ) @@ -320,7 +320,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp for ( ; x <= end - 4; x += 4) { v_int32x4 mul_res = v_load(xmap + x); - mul_res += mul_res + mul_res; + mul_res = v_add(mul_res, v_add(mul_res, mul_res)); v_store(xmap + x, mul_res); } #endif @@ -444,34 +444,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp { int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; - v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0); - v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1); - v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2); + v_float32x4 _dx0 = v_sub(v_load(lutCurr + x + widthP2 * 0 + 2), v_load(lutCurr + x + widthP2 * 0)); + v_float32x4 _dx1 = v_sub(v_load(lutCurr + x + widthP2 * 1 + 2), v_load(lutCurr + x + widthP2 * 1)); + v_float32x4 _dx2 = 
v_sub(v_load(lutCurr + x + widthP2 * 2 + 2), v_load(lutCurr + x + widthP2 * 2)); v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]); - v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1); + v_float32x4 _dy0 = v_sub(_dy00, v_load(lutPrev + x + widthP2 * 0 + 1)); v_store(lutNext+x+widthP2*0+1, _dy00); v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]); - v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1); + v_float32x4 _dy1 = v_sub(_dy10, v_load(lutPrev + x + widthP2 * 1 + 1)); v_store(lutNext+x+widthP2*1+1, _dy10); v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]); - v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1); + v_float32x4 _dy2 = v_sub(_dy20, v_load(lutPrev + x + widthP2 * 2 + 1)); v_store(lutNext+x+widthP2*2+1, _dy20); - v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0); - v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1); - v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2); + v_float32x4 _mag0 = v_add(v_mul(_dx0, _dx0), v_mul(_dy0, _dy0)); + v_float32x4 _mag1 = v_add(v_mul(_dx1, _dx1), v_mul(_dy1, _dy1)); + v_float32x4 _mag2 = v_add(v_mul(_dx2, _dx2), v_mul(_dy2, _dy2)); - v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1); + v_float32x4 mask = v_reinterpret_as_f32(v_gt(_mag2, _mag1)); _dx2 = v_select(mask, _dx2, _dx1); _dy2 = v_select(mask, _dy2, _dy1); - mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0); + mask = v_reinterpret_as_f32(v_gt(v_max(_mag2, _mag1), _mag0)); _dx2 = v_select(mask, _dx2, _dx0); _dy2 = v_select(mask, _dy2, _dy0); @@ -537,25 +537,25 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp int x2 = x << 1; v_float32x4 _mag = v_load(dbuf + x + (width << 1)); v_float32x4 _angle = v_load(dbuf + x + width * 3); - _angle = (_angleScale * _angle) - fhalf; + _angle = v_sub(v_mul(_angleScale, _angle), fhalf); v_int32x4 _hidx = v_floor(_angle); - _angle -= v_cvt_f32(_hidx); + _angle = v_sub(_angle, v_cvt_f32(_hidx)); - v_float32x4 ft0 = _mag * (fone - _angle); - v_float32x4 ft1 = _mag * _angle; + v_float32x4 ft0 = v_mul(_mag, v_sub(fone, _angle)); + v_float32x4 ft1 = v_mul(_mag, _angle); v_store_interleave(gradPtr + x2, ft0, ft1); - v_int32x4 mask0 = _hidx >> 31; - v_int32x4 it0 = mask0 & _nbins; - mask0 = (_hidx >= _nbins); - v_int32x4 it1 = mask0 & _nbins; - _hidx += (it0 - it1); + v_int32x4 mask0 = v_shr<31>(_hidx); + v_int32x4 it0 = v_and(mask0, _nbins); + mask0 = (v_ge(_hidx, _nbins)); + v_int32x4 it1 = v_and(mask0, _nbins); + _hidx = v_add(_hidx, v_sub(it0, it1)); it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero))); - _hidx += ione; - _hidx &= (_hidx < _nbins); + _hidx = v_add(_hidx, ione); + _hidx = v_and(_hidx, v_lt(_hidx, _nbins)); it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero))); v_uint8x16 it2, it3; v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3); @@ -707,9 +707,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor, for (; i <= blockSize.height - 4; i += 4) { - v_float32x4 t = idx - _bh; - t *= t; - idx += ifour; + v_float32x4 t = v_sub(idx, _bh); + t = v_mul(t, t); + idx = v_add(idx, ifour); v_store(_di + i, t); } #endif @@ -725,9 +725,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor, for (; j <= blockSize.height - 4; j += 4) { - v_float32x4 t = idx - _bw; - t *= t; - idx += ifour; + v_float32x4 t = v_sub(idx, _bw); 
+ t = v_mul(t, t); + idx = v_add(idx, ifour); v_store(_dj + j, t); } #endif @@ -936,8 +936,8 @@ const float* HOGCache::getBlock(Point pt, float* buf) int h0 = h[0], h1 = h[1]; v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); - v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); - v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; + v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights)); + v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w); v_store(hist0, _t0); v_store(hist1, _t1); @@ -984,8 +984,8 @@ const float* HOGCache::getBlock(Point pt, float* buf) int h0 = h[0], h1 = h[1]; v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); - v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); - v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; + v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights)); + v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w); v_store(hist0, _t0); v_store(hist1, _t1); @@ -1057,12 +1057,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const #if CV_SIMD128 v_float32x4 p0 = v_load(hist); - v_float32x4 s = p0 * p0; + v_float32x4 s = v_mul(p0, p0); for (i = 4; i <= sz - 4; i += 4) { p0 = v_load(hist + i); - s += p0 * p0; + s = v_add(s, v_mul(p0, p0)); } v_store(partSum, s); #else @@ -1091,17 +1091,17 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const v_float32x4 _scale = v_setall_f32(scale); static v_float32x4 _threshold = v_setall_f32(thresh); - v_float32x4 p = _scale * v_load(hist); + v_float32x4 p = v_mul(_scale, v_load(hist)); p = v_min(p, _threshold); - s = p * p; + s = v_mul(p, p); v_store(hist, p); for(i = 4 ; i <= sz - 4; i += 4) { p = v_load(hist + i); - p *= _scale; + p = v_mul(p, _scale); p = v_min(p, _threshold); - s += p * p; + s = v_add(s, v_mul(p, p)); v_store(hist + i, p); } @@ -1137,7 +1137,7 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const v_float32x4 _scale2 = v_setall_f32(scale); for ( ; i <= sz - 4; i += 4) { - v_float32x4 t = _scale2 * v_load(hist + i); + v_float32x4 t = v_mul(_scale2, v_load(hist + i)); v_store(hist + i, t); } #endif @@ -1593,14 +1593,14 @@ void HOGDescriptor::detect(InputArray _img, #if CV_SIMD128 v_float32x4 _vec = v_load(vec); v_float32x4 _svmVec = v_load(svmVec); - v_float32x4 sum = _svmVec * _vec; + v_float32x4 sum = v_mul(_svmVec, _vec); for( k = 4; k <= blockHistogramSize - 4; k += 4 ) { _vec = v_load(vec + k); _svmVec = v_load(svmVec + k); - sum += _vec * _svmVec; + sum = v_add(sum, v_mul(_vec, _svmVec)); } v_store(partSum, sum); @@ -3392,14 +3392,14 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector &loc #if CV_SIMD128 v_float32x4 _vec = v_load(vec); v_float32x4 _svmVec = v_load(svmVec); - v_float32x4 sum = _svmVec * _vec; + v_float32x4 sum = v_mul(_svmVec, _vec); for( k = 4; k <= blockHistogramSize - 4; k += 4 ) { _vec = v_load(vec + k); _svmVec = v_load(svmVec + k); - sum += _vec * _svmVec; + sum = v_add(sum, v_mul(_vec, _svmVec)); } v_store(partSum, sum); diff --git a/modules/video/src/dis_flow.cpp b/modules/video/src/dis_flow.cpp index a260b8726b..40ac4517a4 100644 --- a/modules/video/src/dis_flow.cpp +++ b/modules/video/src/dis_flow.cpp @@ -520,16 +520,16 @@ DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOp v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \ \ /* Compute diffs between I0 and bilinearly interpolated I1: */ \ - I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) + \ - w01v * 
v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) + \ - w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) + \ - w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) - \ - v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)); \ - I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) + \ - w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) + \ - w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) + \ - w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) - \ - v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)); + I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \ + v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \ + v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \ + v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \ + v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \ + I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \ + v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \ + v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \ + v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \ + v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right))); #define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \ I0_ptr += I0_stride; \ @@ -572,9 +572,9 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar * v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right); /* Update the sums: */ - Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right); - Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right); - SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; + Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right)))); + Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right)))); + SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); I0x_ptr += I0_stride; I0y_ptr += I0_stride; @@ -640,10 +640,10 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right); /* Update the sums: */ - sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right); - sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right); - sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; - sum_diff_vec += I_diff_left + I_diff_right; + sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right)))); + sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right)))); + sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); + sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right)); I0x_ptr += I0_stride; I0y_ptr += I0_stride; @@ -692,7 +692,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri for (int row = 0; row < 8; row++) { HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION; - SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; + SSD_vec = 
v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW; } SSD = v_reduce_sum(SSD_vec); @@ -728,8 +728,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int for (int row = 0; row < 8; row++) { HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION; - sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; - sum_diff_vec += I_diff_left + I_diff_right; + sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); + sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right)); HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW; } sum_diff = v_reduce_sum(sum_diff_vec); diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 8467035dbf..6d51c0cf1a 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -97,8 +97,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -134,8 +134,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x*2), t0, t1); } @@ -293,10 +293,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1); - t0 = t0 >> (W_BITS1-5); - t1 = t1 >> (W_BITS1-5); + t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -307,10 +307,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1); - t0 = t0 >> W_BITS1; - t1 = t1 >> W_BITS1; + t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v_store(dIptr, v00); @@ -332,10 +332,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1); - t0 = t0 >> W_BITS1; - t1 = t1 >> W_BITS1; + t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
v_store(dIptr + 4*2, v00); @@ -548,18 +548,18 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1); - t0 = t0 >> (W_BITS1-5); - t1 = t1 >> (W_BITS1-5); - diff0 = v_pack(t0, t1) - diff0; + t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); + diff0 = v_sub(v_pack(t0, t1), diff0); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v01 = v_reinterpret_as_s16(v_load(dIptr + 8)); v_zip(v00, v01, v10, v11); v_zip(diff2, diff1, v00, v01); - qb0 += v_cvt_f32(v_dotprod(v00, v10)); - qb1 += v_cvt_f32(v_dotprod(v01, v11)); + qb0 = v_add(qb0, v_cvt_f32(v_dotprod(v00, v10))); + qb1 = v_add(qb1, v_cvt_f32(v_dotprod(v01, v11))); } #endif @@ -647,7 +647,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const #if CV_SIMD128 && !CV_NEON v_float32x4 qf0, qf1; - v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1); + v_recombine(v_interleave_pairs(v_add(qb0, qb1)), v_setzero_f32(), qf0, qf1); ib1 += v_reduce_sum(qf0); ib2 += v_reduce_sum(qf1); #endif diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp index 2b164b62d3..02e878a577 100644 --- a/modules/video/src/optflowgf.cpp +++ b/modules/video/src/optflowgf.cpp @@ -463,22 +463,22 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, const float *sptr0 = srow[m], *sptr1; v_float32x4 g4 = v_load(simd_kernel); v_float32x4 s0, s1, s2, s3; - s0 = v_load(sptr0 + x) * g4; - s1 = v_load(sptr0 + x + 4) * g4; - s2 = v_load(sptr0 + x + 8) * g4; - s3 = v_load(sptr0 + x + 12) * g4; + s0 = v_mul(v_load(sptr0 + x), g4); + s1 = v_mul(v_load(sptr0 + x + 4), g4); + s2 = v_mul(v_load(sptr0 + x + 8), g4); + s3 = v_mul(v_load(sptr0 + x + 12), g4); for( i = 1; i <= m; i++ ) { v_float32x4 x0, x1; sptr0 = srow[m+i], sptr1 = srow[m-i]; g4 = v_load(simd_kernel + i*4); - x0 = v_load(sptr0 + x) + v_load(sptr1 + x); - x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4); + x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x)); + x1 = v_add(v_load(sptr0 + x + 4), v_load(sptr1 + x + 4)); s0 = v_muladd(x0, g4, s0); s1 = v_muladd(x1, g4, s1); - x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8); - x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12); + x0 = v_add(v_load(sptr0 + x + 8), v_load(sptr1 + x + 8)); + x1 = v_add(v_load(sptr0 + x + 12), v_load(sptr1 + x + 12)); s2 = v_muladd(x0, g4, s2); s3 = v_muladd(x1, g4, s3); } @@ -493,13 +493,13 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, { const float *sptr0 = srow[m], *sptr1; v_float32x4 g4 = v_load(simd_kernel); - v_float32x4 s0 = v_load(sptr0 + x) * g4; + v_float32x4 s0 = v_mul(v_load(sptr0 + x), g4); for( i = 1; i <= m; i++ ) { sptr0 = srow[m+i], sptr1 = srow[m-i]; g4 = v_load(simd_kernel + i*4); - v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x); + v_float32x4 x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x)); s0 = v_muladd(x0, g4, s0); } v_store(vsum + x, s0); @@ -528,14 +528,14 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, for( ; x <= width*5 - 8; x += 8 ) { v_float32x4 g4 = v_load(simd_kernel); - v_float32x4 s0 = v_load(vsum + x) * g4; - v_float32x4 s1 = v_load(vsum + x + 4) * g4; + v_float32x4 s0 = v_mul(v_load(vsum + x), g4); + v_float32x4 s1 
= v_mul(v_load(vsum + x + 4), g4); for( i = 1; i <= m; i++ ) { g4 = v_load(simd_kernel + i*4); - v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5); - v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4); + v_float32x4 x0 = v_add(v_load(vsum + x - i * 5), v_load(vsum + x + i * 5)); + v_float32x4 x1 = v_add(v_load(vsum + x - i * 5 + 4), v_load(vsum + x + i * 5 + 4)); s0 = v_muladd(x0, g4, s0); s1 = v_muladd(x1, g4, s1); } diff --git a/modules/video/src/variational_refinement.cpp b/modules/video/src/variational_refinement.cpp index cca30f1ce7..968bce6717 100644 --- a/modules/video/src/variational_refinement.cpp +++ b/modules/video/src/variational_refinement.cpp @@ -651,15 +651,15 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range pdU_vec = v_load(pdU + j); pdV_vec = v_load(pdV + j); - derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec; - Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec; - weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec; + derivNorm_vec = v_add(v_add(v_mul(pIx_vec, pIx_vec), v_mul(pIy_vec, pIy_vec)), zeta_vec); + Ik1z_vec = v_add(v_add(pIz_vec, v_mul(pIx_vec, pdU_vec)), v_mul(pIy_vec, pdV_vec)); + weight_vec = v_div(v_div(delta_vec, v_sqrt(v_add(v_div(v_mul(Ik1z_vec, Ik1z_vec), derivNorm_vec), eps_vec))), derivNorm_vec); - pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec; - pa12_vec = weight_vec * (pIx_vec * pIy_vec); - pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec; - pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec); - pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec); + pa11_vec = v_add(v_mul(weight_vec, v_mul(pIx_vec, pIx_vec)), zeta_vec); + pa12_vec = v_mul(weight_vec, v_mul(pIx_vec, pIy_vec)); + pa22_vec = v_add(v_mul(weight_vec, v_mul(pIy_vec, pIy_vec)), zeta_vec); + pb1_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIx_vec))); + pb2_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIy_vec))); pIxx_vec = v_load(pIxx + j); pIxy_vec = v_load(pIxy + j); @@ -667,18 +667,17 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range pIxz_vec = v_load(pIxz + j); pIyz_vec = v_load(pIyz + j); - derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec; - derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec; - Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec; - Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec; - weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec + - Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec); + derivNorm_vec = v_add(v_add(v_mul(pIxx_vec, pIxx_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec); + derivNorm2_vec = v_add(v_add(v_mul(pIyy_vec, pIyy_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec); + Ik1zx_vec = v_add(v_add(pIxz_vec, v_mul(pIxx_vec, pdU_vec)), v_mul(pIxy_vec, pdV_vec)); + Ik1zy_vec = v_add(v_add(pIyz_vec, v_mul(pIxy_vec, pdU_vec)), v_mul(pIyy_vec, pdV_vec)); + weight_vec = v_div(gamma_vec, v_sqrt(v_add(v_add(v_div(v_mul(Ik1zx_vec, Ik1zx_vec), derivNorm_vec), v_div(v_mul(Ik1zy_vec, Ik1zy_vec), derivNorm2_vec)), eps_vec))); - pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec); - pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec); - pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec); - pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / 
derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec); - pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec); + pa11_vec = v_add(pa11_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxx_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm2_vec)))); + pa12_vec = v_add(pa12_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyy_vec), derivNorm2_vec)))); + pa22_vec = v_add(pa22_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyy_vec), derivNorm2_vec)))); + pb1_vec = v_sub(pb1_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyz_vec), derivNorm2_vec)))); + pb2_vec = v_sub(pb2_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyz_vec), derivNorm2_vec)))); v_store(pa11 + j, pa11_vec); v_store(pa12 + j, pa12_vec); @@ -850,26 +849,26 @@ void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator() cW_u_vec = v_load(cW_u + j); cW_v_vec = v_load(cW_v + j); - ux_vec = v_load(cW_u_next + j) - cW_u_vec; - vx_vec = v_load(cW_v_next + j) - cW_v_vec; - uy_vec = v_load(cW_u_next_row + j) - cW_u_vec; - vy_vec = v_load(cW_v_next_row + j) - cW_v_vec; + ux_vec = v_sub(v_load(cW_u_next + j), cW_u_vec); + vx_vec = v_sub(v_load(cW_v_next + j), cW_v_vec); + uy_vec = v_sub(v_load(cW_u_next_row + j), cW_u_vec); + vy_vec = v_sub(v_load(cW_v_next_row + j), cW_v_vec); pWeight_vec = - alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec); + v_div(alpha2_vec, v_sqrt(v_add(v_add(v_add(v_add(v_mul(ux_vec, ux_vec), v_mul(vx_vec, vx_vec)), v_mul(uy_vec, uy_vec)), v_mul(vy_vec, vy_vec)), eps_vec))); v_store(pWeight + j, pWeight_vec); - ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j)); - vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j)); + ux_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next + j), v_load(pW_u + j))); + vx_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next + j), v_load(pW_v + j))); - v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec); - v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec); - v_store(pB_u + j, v_load(pB_u + j) + ux_vec); - v_store(pB_v + j, v_load(pB_v + j) + vx_vec); + v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec)); + v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec)); + v_store(pB_u + j, v_add(v_load(pB_u + j), ux_vec)); + v_store(pB_v + j, v_add(v_load(pB_v + j), vx_vec)); - v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec); - v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec); - v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec); - v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec); + v_store(pA_u_next + j, v_add(v_load(pA_u_next + j), pWeight_vec)); + v_store(pA_v_next + j, v_add(v_load(pA_v_next + j), pWeight_vec)); + v_store(pB_u_next + j, v_sub(v_load(pB_u_next + j), ux_vec)); + v_store(pB_v_next + j, v_sub(v_load(pB_v_next + j), vx_vec)); } #endif for (; j < len - 1; j++) @@ -956,18 +955,18 @@ void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator( for (; j < len - 3; j += 4) { pWeight_vec = v_load(pWeight + j); - uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j)); - vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j)); - - v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec); - v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec); - 
v_store(pB_u + j, v_load(pB_u + j) + uy_vec); - v_store(pB_v + j, v_load(pB_v + j) + vy_vec); - - v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec); - v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec); - v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec); - v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec); + uy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next_row + j), v_load(pW_u + j))); + vy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next_row + j), v_load(pW_v + j))); + + v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec)); + v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec)); + v_store(pB_u + j, v_add(v_load(pB_u + j), uy_vec)); + v_store(pB_v + j, v_add(v_load(pB_v + j), vy_vec)); + + v_store(pA_u_next_row + j, v_add(v_load(pA_u_next_row + j), pWeight_vec)); + v_store(pA_v_next_row + j, v_add(v_load(pA_v_next_row + j), pWeight_vec)); + v_store(pB_u_next_row + j, v_sub(v_load(pB_u_next_row + j), uy_vec)); + v_store(pB_v_next_row + j, v_sub(v_load(pB_v_next_row + j), vy_vec)); } #endif for (; j < len; j++) @@ -1084,15 +1083,13 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran pdv_shifted_vec = v_reinterpret_as_f32( v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec))); - sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec + - pW_vec * pdu_next_row_vec; - sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec + - pW_vec * pdv_next_row_vec; + sigmaU_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdu_shifted_vec), v_mul(pW_vec, pdu_next_vec)), v_mul(pW_prev_row_vec, pdu_prev_row_vec)), v_mul(pW_vec, pdu_next_row_vec)); + sigmaV_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdv_shifted_vec), v_mul(pW_vec, pdv_next_vec)), v_mul(pW_prev_row_vec, pdv_prev_row_vec)), v_mul(pW_vec, pdv_next_row_vec)); pdu_vec = v_load(pdu + j); pdv_vec = v_load(pdv + j); - pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec); - pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec); + pdu_vec = v_add(pdu_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaU_vec, v_load(pb1 + j)), v_mul(pdv_vec, pa12_vec)), v_load(pa11 + j)), pdu_vec))); + pdv_vec = v_add(pdv_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaV_vec, v_load(pb2 + j)), v_mul(pdu_vec, pa12_vec)), v_load(pa22 + j)), pdv_vec))); v_store(pdu + j, pdu_vec); v_store(pdv + j, pdv_vec); diff --git a/samples/cpp/simd_basic.cpp b/samples/cpp/simd_basic.cpp index 9af4d91cef..ef78c39a45 100644 --- a/samples/cpp/simd_basic.cpp +++ b/samples/cpp/simd_basic.cpp @@ -38,8 +38,8 @@ int main(int /*argc*/, char** /*argv*/) printf("================== arithm check =================\n"); v_uint8 a = vx_setall_u8(10); - v_uint8 c = a + vx_setall_u8(45); - printf("(vx_setall_u8(10) + vx_setall_u8(45)).get0() => %d\n", (int)c.get0()); + v_uint8 c = v_add(a, vx_setall_u8(45)); + printf("v_get0(vx_setall_u8(10) + vx_setall_u8(45)) => %d\n", (int)v_get0(c)); #else printf("\nSIMD intrinsics are not available. 
Check compilation target and passed build options.\n"); #endif diff --git a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp index 9be4170d7b..52018461c3 100644 --- a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp +++ b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp @@ -85,7 +85,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int //! [convolution-1D-main] //! [convolution-1D-main-h1] - int step = v_float32().nlanes; + int step = VTraits::vlanes(); float *sptr = src_32.ptr(row), *kptr = kernel.ptr(rowk); for (int k = 0; k < ksize; k++) { @@ -96,7 +96,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int for (i = 0; i + step < len; i += step) { v_float32 window = vx_load(sptr + i + k); - v_float32 sum = vx_load(ans + i) + kernel_wide * window; + v_float32 sum = v_add(vx_load(ans + i), v_mul(kernel_wide, window)); v_store(ans + i, sum); } //! [convolution-1D-main-h2] @@ -122,7 +122,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel) copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE); - int step = v_float32().nlanes; + int step = VTraits::vlanes(); //! [convolution-2D-init] //! [convolution-2D-main] @@ -135,7 +135,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel) int j; for (j = 0; j + step < cols; j += step) { - v_float32 sum = vx_load(&dst.ptr(i)[j]) + vx_load(&ans[j]); + v_float32 sum = v_add(vx_load(&dst.ptr(i)[j]), vx_load(&ans[j])); v_store(&dst.ptr(i)[j], sum); }
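
The substitution applied across these hunks is mechanical: overloaded operators on universal-intrinsic vectors (+, -, *, /, >>, &, comparisons) become named wrappers (v_add, v_sub, v_mul, v_div, v_shr<N>, v_and, v_lt/v_gt/v_ge), and the compile-time v_xxx::nlanes becomes VTraits<v_xxx>::vlanes() (or VTraits<v_xxx>::max_nlanes for sizing stack buffers), so the same code also builds against scalable backends where the lane count is not a compile-time constant. Below is a minimal sketch of the before/after shape, assuming only the public intrin.hpp API; the helper name muladd_scalar and its signature are illustrative, not taken from this patch:

    // Minimal sketch of the operator -> named-wrapper migration (illustrative only).
    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // dst[i] = a[i] * b[i] + c
    static void muladd_scalar(const float* a, const float* b, float c, float* dst, int n)
    {
        int i = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        const int step = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
        v_float32 vc = vx_setall_f32(c);
        for (; i + step <= n; i += step)
        {
            v_float32 va = vx_load(a + i), vb = vx_load(b + i);
            // was: v_store(dst + i, va * vb + vc);
            v_store(dst + i, v_add(v_mul(va, vb), vc));
        }
    #endif
        for (; i < n; i++)        // scalar tail for the remaining elements
            dst[i] = a[i] * b[i] + c;
    }

The same reasoning covers the last-lane helpers used in the sumpixels hunks: v_extract_n<nlanes-1>() and v_broadcast_element<nlanes-1>() are spelled v_extract_highest() and v_broadcast_highest(), which avoids referencing nlanes as a template argument on backends where it is not a constant.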