From c8080aa4155b5e21bcdd291deee2357cfbca84a3 Mon Sep 17 00:00:00 2001 From: Wanli Date: Fri, 13 Sep 2024 16:56:48 +0900 Subject: [PATCH] Merge pull request #26109 from WanliZhong:univ_intrin_operator2warpper Replace operators with wrapper functions on universal intrinsics backends #26109 This PR aims to replace the operators(logic, arithmetic, bit) with wrapper functions(v_add, v_eq, v_and...) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- .../core/include/opencv2/core/hal/intrin.hpp | 387 +++--------------- .../include/opencv2/core/hal/intrin_avx.hpp | 270 ++++++------ .../opencv2/core/hal/intrin_avx512.hpp | 249 ++++++----- .../include/opencv2/core/hal/intrin_cpp.hpp | 140 +++---- .../include/opencv2/core/hal/intrin_lasx.hpp | 208 +++++----- .../include/opencv2/core/hal/intrin_lsx.hpp | 190 ++++----- .../include/opencv2/core/hal/intrin_msa.hpp | 123 +++--- .../opencv2/core/hal/intrin_rvv071.hpp | 168 +++----- .../include/opencv2/core/hal/intrin_sse.hpp | 221 +++++----- .../include/opencv2/core/hal/intrin_vsx.hpp | 112 +++-- .../include/opencv2/core/hal/intrin_wasm.hpp | 191 +++++---- modules/features2d/src/fast.avx2.cpp | 2 +- modules/imgproc/src/resize.cpp | 4 +- modules/imgproc/src/sumpixels.simd.hpp | 76 ++-- 14 files changed, 962 insertions(+), 1379 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index c9407a1d43..28fe6a02a9 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -717,44 +717,70 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } -#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) +#if !CV_SIMD_SCALABLE // Compatibility layer - +#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) template struct VTraits { static inline int vlanes() { return T::nlanes; } enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; using lane_type = typename T::lane_type; }; - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a + b; \ - } \ - inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \ + //////////// get0 //////////// + #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ { \ - return a - b; \ - } \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ - return v_add(f1 + f2, vf...); \ + return v.get0(); \ } - #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \ - inline _Tpvec v_shr(const _Tpvec& a, int n) \ - { \ - return a >> n; \ - } \ - inline _Tpvec v_shl(const _Tpvec& a, int n) \ - { \ - return a << n; \ + + OPENCV_HAL_WRAP_GRT0(v_uint8) + OPENCV_HAL_WRAP_GRT0(v_int8) + OPENCV_HAL_WRAP_GRT0(v_uint16) + OPENCV_HAL_WRAP_GRT0(v_int16) + OPENCV_HAL_WRAP_GRT0(v_uint32) + OPENCV_HAL_WRAP_GRT0(v_int32) + OPENCV_HAL_WRAP_GRT0(v_uint64) + OPENCV_HAL_WRAP_GRT0(v_int64) + OPENCV_HAL_WRAP_GRT0(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_GRT0(v_uint8x16) + OPENCV_HAL_WRAP_GRT0(v_uint16x8) + OPENCV_HAL_WRAP_GRT0(v_uint32x4) + OPENCV_HAL_WRAP_GRT0(v_uint64x2) + OPENCV_HAL_WRAP_GRT0(v_int8x16) + OPENCV_HAL_WRAP_GRT0(v_int16x8) + OPENCV_HAL_WRAP_GRT0(v_int32x4) + OPENCV_HAL_WRAP_GRT0(v_int64x2) + OPENCV_HAL_WRAP_GRT0(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_GRT0(v_uint8x32) + OPENCV_HAL_WRAP_GRT0(v_uint16x16) + OPENCV_HAL_WRAP_GRT0(v_uint32x8) + OPENCV_HAL_WRAP_GRT0(v_uint64x4) + OPENCV_HAL_WRAP_GRT0(v_int8x32) + OPENCV_HAL_WRAP_GRT0(v_int16x16) + OPENCV_HAL_WRAP_GRT0(v_int32x8) + OPENCV_HAL_WRAP_GRT0(v_int64x4) + OPENCV_HAL_WRAP_GRT0(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x4) + #endif + #endif +#endif + + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \ + return v_add(v_add(f1, f2), f3, vf...); \ } - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) @@ -778,12 +804,6 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) #endif @@ -799,110 +819,15 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) #endif #endif - #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \ - inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a & b; \ - } \ - inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a | b; \ - } \ - inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a ^ b; \ - } - - #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \ - inline _Tpvec v_not(const _Tpvec& a) \ - { \ - return ~a; \ - } - - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32) - 
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32) - OPENCV_HAL_WRAP_NOT_OP(v_uint8) - OPENCV_HAL_WRAP_NOT_OP(v_uint16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32) - OPENCV_HAL_WRAP_NOT_OP(v_uint64) - OPENCV_HAL_WRAP_NOT_OP(v_int8) - OPENCV_HAL_WRAP_NOT_OP(v_int16) - OPENCV_HAL_WRAP_NOT_OP(v_int32) - OPENCV_HAL_WRAP_NOT_OP(v_int64) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x2) - OPENCV_HAL_WRAP_NOT_OP(v_int8x16) - OPENCV_HAL_WRAP_NOT_OP(v_int16x8) - OPENCV_HAL_WRAP_NOT_OP(v_int32x4) - OPENCV_HAL_WRAP_NOT_OP(v_int64x2) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x32) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x4) - OPENCV_HAL_WRAP_NOT_OP(v_int8x32) - OPENCV_HAL_WRAP_NOT_OP(v_int16x16) - OPENCV_HAL_WRAP_NOT_OP(v_int32x8) - OPENCV_HAL_WRAP_NOT_OP(v_int64x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4) - #endif - #endif - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a * b; \ - } \ template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_mul(f1 * f2, vf...); \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... 
vf) { \ + return v_mul(v_mul(f1, f2), f3, vf...); \ } OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) @@ -939,140 +864,6 @@ namespace CV__SIMD_NAMESPACE { #endif #endif - #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \ - inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a / b; \ - } - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \ - inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a op b; \ - } - #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \ - inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a == b; \ - } \ - inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a != b; \ - } - - #define OPENCV_HAL_WRAP_CMP(_Tpvec) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=) - - OPENCV_HAL_WRAP_CMP(v_uint8) - OPENCV_HAL_WRAP_CMP(v_uint16) - OPENCV_HAL_WRAP_CMP(v_uint32) - OPENCV_HAL_WRAP_EQ_OP(v_uint64) - OPENCV_HAL_WRAP_CMP(v_int8) - OPENCV_HAL_WRAP_CMP(v_int16) - OPENCV_HAL_WRAP_CMP(v_int32) - OPENCV_HAL_WRAP_EQ_OP(v_int64) - OPENCV_HAL_WRAP_CMP(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_CMP(v_uint8x16) - OPENCV_HAL_WRAP_CMP(v_uint16x8) - OPENCV_HAL_WRAP_CMP(v_uint32x4) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x2) - OPENCV_HAL_WRAP_CMP(v_int8x16) - OPENCV_HAL_WRAP_CMP(v_int16x8) - OPENCV_HAL_WRAP_CMP(v_int32x4) - OPENCV_HAL_WRAP_EQ_OP(v_int64x2) - OPENCV_HAL_WRAP_CMP(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_CMP(v_uint8x32) - OPENCV_HAL_WRAP_CMP(v_uint16x16) - OPENCV_HAL_WRAP_CMP(v_uint32x8) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x4) - OPENCV_HAL_WRAP_CMP(v_int8x32) - OPENCV_HAL_WRAP_CMP(v_int16x16) - OPENCV_HAL_WRAP_CMP(v_int32x8) - OPENCV_HAL_WRAP_EQ_OP(v_int64x4) - OPENCV_HAL_WRAP_CMP(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x4) - #endif - #endif - - //////////// get0 //////////// - #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ - inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ - { \ - return v.get0(); \ - } - - OPENCV_HAL_WRAP_GRT0(v_uint8) - OPENCV_HAL_WRAP_GRT0(v_int8) - OPENCV_HAL_WRAP_GRT0(v_uint16) - OPENCV_HAL_WRAP_GRT0(v_int16) - OPENCV_HAL_WRAP_GRT0(v_uint32) - OPENCV_HAL_WRAP_GRT0(v_int32) - OPENCV_HAL_WRAP_GRT0(v_uint64) - OPENCV_HAL_WRAP_GRT0(v_int64) - OPENCV_HAL_WRAP_GRT0(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_GRT0(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_GRT0(v_uint8x16) - OPENCV_HAL_WRAP_GRT0(v_uint16x8) - OPENCV_HAL_WRAP_GRT0(v_uint32x4) - OPENCV_HAL_WRAP_GRT0(v_uint64x2) - OPENCV_HAL_WRAP_GRT0(v_int8x16) - OPENCV_HAL_WRAP_GRT0(v_int16x8) - OPENCV_HAL_WRAP_GRT0(v_int32x4) - OPENCV_HAL_WRAP_GRT0(v_int64x2) - OPENCV_HAL_WRAP_GRT0(v_float32x4) - #if CV_SIMD_64F - 
OPENCV_HAL_WRAP_GRT0(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_GRT0(v_uint8x32) - OPENCV_HAL_WRAP_GRT0(v_uint16x16) - OPENCV_HAL_WRAP_GRT0(v_uint32x8) - OPENCV_HAL_WRAP_GRT0(v_uint64x4) - OPENCV_HAL_WRAP_GRT0(v_int8x32) - OPENCV_HAL_WRAP_GRT0(v_int16x16) - OPENCV_HAL_WRAP_GRT0(v_int32x8) - OPENCV_HAL_WRAP_GRT0(v_int64x4) - OPENCV_HAL_WRAP_GRT0(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_GRT0(v_float64x4) - #endif - #endif - #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ { \ @@ -1142,74 +933,6 @@ namespace CV__SIMD_NAMESPACE { #endif //!CV_SIMD_SCALABLE -#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP) -// Compatibility layer for the backend that cleaned up. - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_add(v_add(f1, f2), vf...); \ - } - - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_mul(v_mul(f1, f2), vf...); \ - } - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) - #endif - - #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ - inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ - { \ - return v_extract_n::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_EXTRACT(v_uint8) - OPENCV_HAL_WRAP_EXTRACT(v_int8) - OPENCV_HAL_WRAP_EXTRACT(v_uint16) - OPENCV_HAL_WRAP_EXTRACT(v_int16) - OPENCV_HAL_WRAP_EXTRACT(v_uint32) - OPENCV_HAL_WRAP_EXTRACT(v_int32) - OPENCV_HAL_WRAP_EXTRACT(v_uint64) - OPENCV_HAL_WRAP_EXTRACT(v_int64) - OPENCV_HAL_WRAP_EXTRACT(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_EXTRACT(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ - inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ - { \ - return v_broadcast_element::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_BROADCAST(v_uint32) - OPENCV_HAL_WRAP_BROADCAST(v_int32) - OPENCV_HAL_WRAP_BROADCAST(v_float32) - -#endif //CV_NEON - //! 
@cond IGNORED // backward compatibility diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index eed609f80e..e204050625 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -673,53 +673,51 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) /** Arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) - -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64) + 
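As a quick, hedged illustration of the calling convention the v_add/v_sub/v_mul/v_div wrappers in this hunk give to downstream code (not part of the patch; scale_add and its arguments are illustrative names, and a CV_SIMD-enabled build with the universal-intrinsics header is assumed):

// --- illustrative example, not part of the patch ---
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// dst[i] = a[i]*b[i] + a[i], written with v_mul/v_add instead of *, +
void scale_add(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
#if CV_SIMD
    const int step = VTraits<v_float32>::vlanes();
    for (; i <= n - step; i += step)
    {
        v_float32 va = vx_load(a + i), vb = vx_load(b + i);
        v_store(dst + i, v_add(v_mul(va, vb), va));   // was: va * vb + va
    }
#endif
    for (; i < n; i++)                                // scalar tail
        dst[i] = a[i] * b[i] + a[i];
}
// --- end of example ---

The compatibility layer in intrin.hpp also keeps variadic forms, so v_add(va, vb, vc) folds to v_add(v_add(va, vb), vc) and the same holds for v_mul.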
+OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd) // saturating multiply 8-bit, 16-bit -inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) { v_int16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epu16(a.val, b.val); @@ -727,7 +725,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_uint16x16(_v256_packs_epu32(p0, p1)); } -inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epi16(a.val, b.val); @@ -735,14 +733,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_int16x16(_mm256_packs_epi32(p0, p1)); } -inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b) -{ a = a * b; return a; } -inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) -{ a = a * b; return a; } -inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) -{ a = a * b; return a; } -inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) -{ a = a * b; return a; } /** Non-saturating arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ @@ -833,13 +823,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(srai(a.val, imm)); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -867,11 +857,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx /** Bitwise logic **/ -#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, 
_mm256_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) @@ -900,29 +890,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) /** Comparison **/ -#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ - inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m256i smask = _mm256_set1_##suffix(sbit); \ return _Tpuvec(_mm256_cmpgt_##suffix( \ _mm256_xor_si256(a.val, smask), \ _mm256_xor_si256(b.val, smask))); \ } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) @@ -932,25 +922,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ - 
OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) @@ -1216,9 +1206,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a) { return v_reduce_sum(v_reinterpret_as_s32(a)); } inline int v_reduce_sum(const v_int16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline float v_reduce_sum(const v_float32x8& a) { @@ -1273,27 +1263,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { v_uint32x8 l, h; - v_expand(v_add_wrap(a - b, b - a), l, h); - return v_reduce_sum(l + h); + v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) { v_uint32x8 l, h; v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) { - return v_reduce_sum(v_max(a, b) - v_min(a, b)); + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 m = a < b; - return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); } inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) { - return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); + return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))))); } /** Popcount **/ @@ -1308,15 +1298,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a) inline v_uint16x16 v_popcount(const v_uint16x16& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff)); } inline v_uint32x8 v_popcount(const v_uint32x8& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff)); } inline v_uint64x4 v_popcount(const v_uint64x4& a) { @@ -1408,9 +1398,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ 
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b*b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) @@ -1419,7 +1409,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd) inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) @@ -1429,16 +1419,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x inline v_float32x8 v_invsqrt(const v_float32x8& x) { - v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 half = v_mul(x, v256_setall_f32(0.5)); v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); // todo: _mm256_fnmsub_ps - t *= v256_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; } inline v_float64x4 v_invsqrt(const v_float64x4& x) { - return v256_setall_f64(1.) / v_sqrt(x); + return v_div(v256_setall_f64(1.), v_sqrt(x)); } /** Absolute values **/ @@ -1451,23 +1441,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) inline v_float32x8 v_abs(const v_float32x8& x) -{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); } inline v_float64x4 v_abs(const v_float64x4& x) -{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } +{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); } /** Absolute difference **/ inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) { v_int8x32 d = v_sub_wrap(a, b); - v_int8x32 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x32 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) @@ -1475,26 +1465,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 d = a - b; - v_int32x8 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x8 d = v_sub(a, b); + v_int32x8 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) { - v_int8x32 d = a - b; - v_int8x32 m = a < b; - return (d ^ m) - m; + v_int8x32 d = v_sub(a, b); + v_int8x32 
m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1789,7 +1779,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) @@ -1799,7 +1789,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) return v_int64x4(_mm256_add_epi64(even, odd)); } inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) @@ -1816,7 +1806,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) return v_uint32x8(_mm256_add_epi32(prod0, prod1)); } inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { @@ -1831,7 +1821,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) return v_int32x8(_mm256_add_epi32(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) @@ -1855,7 +1845,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) )); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { @@ -1871,13 +1861,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) )); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1923,7 +1913,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) { @@ -1934,7 +1924,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) return v_int64x4(_mm256_add_epi64(lo, hi)); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, 
const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) @@ -1953,7 +1943,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, @@ -2058,43 +2048,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template inline v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2127,43 +2117,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) { // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. 
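The pack routines in these hunks all rewrite the same rounding-shift pattern, (a + 2^(n-1)) >> n, through the wrappers. A minimal sketch of that pattern in width-agnostic form (round_shr_u16 is an illustrative name, not part of the patch; a CV_SIMD-enabled build and n > 0 are assumed, as in the pack routines themselves):

// --- illustrative example, not part of the patch ---
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Rounding right shift: (a + (1 << (n-1))) >> n, assuming n > 0
v_uint16 round_shr_u16(const v_uint16& a, int n)
{
    v_uint16 delta = vx_setall_u16((ushort)(1 << (n - 1)));
    return v_shr(v_add(a, delta), n);   // was: (a + delta) >> n
}
// --- end of example ---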
v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template inline v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2192,28 +2182,28 @@ template inline v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index e59b8d92eb..64dab6b3ae 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -663,58 +663,56 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) } #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) 
-OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) - -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64) /** Saturating arithmetics **/ -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) - -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd) // saturating multiply -inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) 
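The saturating 8-bit multiplies converted just below keep the same expand-multiply-pack shape, only renamed from operator* to v_mul. As a hedged sketch of that shape in width-agnostic form (saturating_mul_u8 is an illustrative name, not part of the patch; a CV_SIMD-enabled build is assumed):

// --- illustrative example, not part of the patch ---
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Saturating 8-bit multiply via widening multiply + saturating pack,
// the same shape as the v_mul(v_uint8x64, v_uint8x64) overload below.
v_uint8 saturating_mul_u8(const v_uint8& a, const v_uint8& b)
{
    v_uint16 c, d;
    v_mul_expand(a, b, c, d);   // widen each half to 16 bit and multiply
    return v_pack(c, d);        // saturating pack back to 8 bit
}
// --- end of example ---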
+inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b) { v_uint16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b) { v_int16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epu16(a.val, b.val); @@ -724,7 +722,7 @@ inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) const __m512i m = _mm512_set1_epi32(65535); return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); } -inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epi16(a.val, b.val); @@ -733,15 +731,6 @@ inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) return v_int16x32(_mm512_packs_epi32(p0, p1)); } -inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) -{ a = a * b; return a; } -inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) -{ a = a * b; return a; } -inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) -{ a = a * b; return a; } -inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) -{ a = a * b; return a; } - inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } @@ -802,13 +791,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -830,10 +819,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) @@ -865,16 +854,16 @@ 
OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) /** Comparison **/ #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) @@ -886,16 +875,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) @@ -1250,9 +1239,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) inline int v_reduce_sum(const v_int16x32& a) -{ return v_reduce_sum(v_expand_low(a) + 
v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint v_reduce_sum(const v_uint16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ inline sctype v_reduce_##func(const _Tpvec& a) \ @@ -1306,17 +1295,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) -{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); } inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) -{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) -{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); } inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) -{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); } inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) -{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); } /** Popcount **/ inline v_uint8x64 v_popcount(const v_int8x64& a) @@ -1351,8 +1340,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a) _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff)); #endif } inline v_uint32x16 v_popcount(const v_int32x16& a) @@ -1361,9 +1350,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a) return v_uint32x16(_mm512_popcnt_epi32(a.val)); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff)); #endif } inline v_uint64x8 v_popcount(const v_int64x8& a) @@ -1403,9 +1392,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b * b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) @@ -1413,7 +1402,7 @@ 
OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd) inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) { return v_fma(a, b, c); } @@ -1422,9 +1411,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x) #if CV_AVX_512ER return v_float32x16(_mm512_rsqrt28_ps(x.val)); #else - v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 half = v_mul(x, v512_setall_f32(0.5)); v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); - t *= v512_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; #endif } @@ -1434,7 +1423,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x) #if CV_AVX_512ER return v_float64x8(_mm512_rsqrt28_pd(x.val)); #else - return v512_setall_f64(1.) / v_sqrt(x); + return v_div(v512_setall_f64(1.), v_sqrt(x)); // v_float64x8 half = x * v512_setall_f64(0.5); // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); // t *= v512_setall_f64(1.5) - ((t * t) * half); @@ -1482,17 +1471,17 @@ inline v_float64x8 v_abs(const v_float64x8& x) /** Absolute difference **/ inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) { v_int8x64 d = v_sub_wrap(a, b); - v_int8x64 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x64 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) @@ -1500,26 +1489,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) { - v_int32x16 d = a - b; - v_int32x16 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x16 d = v_sub(a, b); + v_int32x16 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) { - v_int8x64 d = a - b; - v_int8x64 m = a < b; - return (d ^ m) - m; + v_int8x64 d = v_sub(a, b); + v_int8x64 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1818,7 +1807,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), 
c); } // 32 >> 64 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) @@ -1828,7 +1817,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) @@ -1844,7 +1833,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) return v_uint32x16(_mm512_add_epi32(prod0, prod1)); } inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) { @@ -1859,7 +1848,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) return v_int32x16(_mm512_add_epi32(prod0, prod1)); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) @@ -1883,7 +1872,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) )); } inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) { @@ -1893,13 +1882,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1944,7 +1933,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); } inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) { return v_dotprod_expand(a, b); } @@ -1955,7 +1944,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) { return v_dotprod_expand(a, b); } inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ @@ -1969,7 +1958,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v, v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); - return v_fma(v04, 
m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x16 v_matmuladd(const v_float32x16& v, @@ -2070,43 +2059,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a) { v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template inline v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(schar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2139,43 +2128,43 @@ template inline v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template inline v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(short* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2196,28 +2185,28 @@ template inline v_uint32x16 
v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b) { v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a) { v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(int* ptr, const v_int64x8& a) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index e364ba359b..ef1a33a630 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -225,32 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto Element-wise binary and unary operations. - Arithmetics: -@ref operator +(const v_reg &a, const v_reg &b) "+", -@ref operator -(const v_reg &a, const v_reg &b) "-", -@ref operator *(const v_reg &a, const v_reg &b) "*", -@ref operator /(const v_reg &a, const v_reg &b) "/", +@ref v_add(const v_reg &a, const v_reg &b) "+", +@ref v_sub(const v_reg &a, const v_reg &b) "-", +@ref v_mul(const v_reg &a, const v_reg &b) "*", +@ref v_div(const v_reg &a, const v_reg &b) "/", @ref v_mul_expand - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap - Bitwise shifts: -@ref operator <<(const v_reg &a, int s) "<<", -@ref operator >>(const v_reg &a, int s) ">>", +@ref v_shl(const v_reg &a, int s) "<<", +@ref v_shr(const v_reg &a, int s) ">>", @ref v_shl, @ref v_shr - Bitwise logic: -@ref operator &(const v_reg &a, const v_reg &b) "&", -@ref operator |(const v_reg &a, const v_reg &b) "|", -@ref operator ^(const v_reg &a, const v_reg &b) "^", -@ref operator ~(const v_reg &a) "~" +@ref v_and(const v_reg &a, const v_reg &b) "&", +@ref v_or(const v_reg &a, const v_reg &b) "|", +@ref v_xor(const v_reg &a, const v_reg &b) "^", +@ref v_not(const v_reg &a) "~" - Comparison: -@ref operator >(const v_reg &a, const v_reg &b) ">", -@ref operator >=(const v_reg &a, const v_reg &b) ">=", -@ref operator <(const v_reg &a, const v_reg &b) "<", -@ref operator <=(const v_reg &a, const v_reg &b) "<=", -@ref operator ==(const v_reg &a, const v_reg &b) "==", -@ref operator !=(const v_reg &a, const v_reg &b) "!=" +@ref v_gt(const v_reg &a, const v_reg &b) ">", +@ref v_ge(const v_reg &a, const v_reg &b) ">=", +@ref v_lt(const v_reg &a, const v_reg &b) "<", +@ref v_le(const v_reg &a, const v_reg &b) "<=", +@ref v_eq(const v_reg &a, const v_reg &b) "==", +@ref v_ne(const v_reg &a, const v_reg &b) "!=" - min/max: @ref v_min, @ref v_max @@ -573,50 +573,43 @@ enum { /** @brief Add values For all types. */ -template CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Subtract values For all types. 
*/ -template CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Multiply values For 16- and 32-bit integer types and floating types. */ -template CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Divide values For floating types only. */ -template CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise AND Only for integer types. */ -template CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise OR Only for integer types. */ -template CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise XOR Only for integer types.*/ -template CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise NOT Only for integer types.*/ -template CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); +template CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a); #ifndef CV_DOXYGEN @@ -639,33 +632,26 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ +#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \ template inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ return c; \ -} \ -template inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) +CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add) +CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub) +CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul) +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div) -#define 
CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ +#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \ template CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ @@ -673,29 +659,20 @@ v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ return c; \ -} \ -template CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */ -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and) +CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or) +CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor) -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ +#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \ template CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ +v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -703,7 +680,7 @@ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ return c; \ } \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not) #endif // !CV_DOXYGEN @@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$. */ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -#define OPENCV_HAL_MATH_HAVE_LOG 1 /** * @brief Error function. @@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp) //! @cond IGNORED OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -#define OPENCV_HAL_MATH_HAVE_SIN 1 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -#define OPENCV_HAL_MATH_HAVE_COS 1 //! @endcond /** @brief Absolute value of elements @@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \ template \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ v_reg<_Tp, n> c; \ @@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Less-than comparison For all types except 64-bit integer values. 
*/ -OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_CMP_OP(<, v_lt) /** @brief Greater-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(>, v_gt) /** @brief Less-than or equal comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(<=) +OPENCV_HAL_IMPL_CMP_OP(<=, v_le) /** @brief Greater-than or equal comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(>=) +OPENCV_HAL_IMPL_CMP_OP(>=, v_ge) /** @brief Equal comparison */ -OPENCV_HAL_IMPL_CMP_OP(==) +OPENCV_HAL_IMPL_CMP_OP(==, v_eq) /** @brief Not equal comparison */ -OPENCV_HAL_IMPL_CMP_OP(!=) +OPENCV_HAL_IMPL_CMP_OP(!=, v_ne) template inline v_reg v_not_nan(const v_reg& a) @@ -1301,8 +1275,8 @@ template inline void v_hsum(const v_reg<_Tp, n>& a, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ -template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \ +template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -1313,12 +1287,12 @@ template inline v_reg<_Tp, n> operator shift_op(const v_reg /** @brief Bitwise shift left For 16-, 32- and 64-bit integer values. */ -OPENCV_HAL_IMPL_SHIFT_OP(<< ) +OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl) /** @brief Bitwise shift right For 16-, 32- and 64-bit integer values. */ -OPENCV_HAL_IMPL_SHIFT_OP(>> ) +OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr) //! @brief Helper macro //! @ingroup core_hal_intrin_impl @@ -2942,7 +2916,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64) //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \ template inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \ -{ return a << shift; } +{ return v_shl(a, shift); } //! @name Left shift //! @{ @@ -2959,7 +2933,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64) //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \ template inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \ -{ return a >> shift; } +{ return v_shr(a, shift); } //! @name Right shift //! 
@{ @@ -3285,7 +3259,7 @@ inline v_reg v_matmuladd(const v_reg& v, template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b, const v_reg& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp index db491cc137..45f53de8a2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -746,53 +746,51 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4) /** Arithmetics **/ #define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d) - -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s) -OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s) -OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s) -OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d) -OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w) 
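// [Editorial aside — illustrative usage sketch, not part of the patch.]
// With the operator overloads removed from the C++ fallback and the backends, call sites
// use the named wrappers documented above. The kernel below is hypothetical (function
// name, parameters and loop are ours); the intrinsics it calls (vx_load, vx_setall_u8,
// v_add, v_gt, v_and, v_store, VTraits) are the public universal-intrinsics API this
// patch targets.
#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;
static void add_then_threshold(const uchar* src, uchar* dst, int len, uchar bias, uchar thr)
{
    v_uint8 vbias = vx_setall_u8(bias), vthr = vx_setall_u8(thr);
    int i = 0;
    for (; i <= len - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
    {
        v_uint8 x = v_add(vx_load(src + i), vbias);   // was: vx_load(src + i) + vbias
        v_uint8 mask = v_gt(x, vthr);                 // was: x > vthr
        v_store(dst + i, v_and(x, mask));             // was: x & mask
    }
    for (; i < len; i++)
    {
        int x = std::min(src[i] + bias, 255);         // scalar tail, saturating like v_add
        dst[i] = x > thr ? (uchar)x : 0;
    }
}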
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d) + +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d) // saturating multiply 8-bit, 16-bit -inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) { v_int16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) { __m256i pl = __lasx_xvmul_h(a.val, b.val); __m256i ph = __lasx_xvmuh_hu(a.val, b.val); @@ -800,7 +798,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) __m256i p1 = __lasx_xvilvh_h(ph, pl); return v_uint16x16(_v256_packs_epu32(p0, p1)); } -inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) { __m256i pl = __lasx_xvmul_h(a.val, b.val); __m256i ph = __lasx_xvmuh_h(a.val, b.val); @@ -808,14 +806,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) __m256i p1 = __lasx_xvilvh_h(ph, pl); return v_int16x16(_lasx_packs_w(p0, p1)); } -inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b) -{ a = a * b; return a; } -inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) -{ a = a * b; return a; } -inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) -{ a = a * b; return a; } -inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) -{ a = a * b; return a; } /** Non-saturating arithmetics **/ @@ -904,13 +894,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) 
\ { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -932,10 +922,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \ - OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \ - OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(__lasx_xvnori_b(a.val, 0)); } OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1)) @@ -948,16 +938,14 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1)) OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1)) #define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; } + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } #define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \ - OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \ - OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \ - OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); } OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps) @@ -983,25 +971,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const /** Comparison **/ #define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } #define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ - inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } 
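// [Editorial aside — illustrative sketch, not part of the patch.]
// OPENCV_HAL_IMPL_LASX_CMP_OP_OV above derives v_ne/v_lt/v_ge/v_le from v_eq, v_gt and
// v_not: every comparison yields a per-lane mask of all-ones (true) or all-zeros (false),
// so "not equal" is simply the bitwise complement of "equal". Hypothetical scalar model
// of one int32 lane (helper names are ours):
#include <cstdint>
static inline int32_t lane_eq(int32_t a, int32_t b) { return (a == b) ? int32_t(-1) : 0; }
static inline int32_t lane_lt(int32_t a, int32_t b) { return (a <  b) ? int32_t(-1) : 0; }
static inline int32_t lane_ne(int32_t a, int32_t b) { return ~lane_eq(a, b); } // v_ne = v_not(v_eq)
static inline int32_t lane_ge(int32_t a, int32_t b) { return ~lane_lt(a, b); } // v_ge = v_not(v_lt)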
\ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \ } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ - inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \ OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec) @@ -1011,37 +999,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu) OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu) #define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d) OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d) #define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); } #define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix) + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix) OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s) OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d) -inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b) +inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b) { return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); } -inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b) +inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b) { return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); } -inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b) +inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b) { return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); } -inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b) +inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b) { return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); } inline v_float32x8 v_not_nan(const v_float32x8& a) @@ -1309,9 +1297,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a) { return v_reduce_sum(v_reinterpret_as_s32(a)); } inline int v_reduce_sum(const v_int16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return 
v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline float v_reduce_sum(const v_float32x8& a) { @@ -1379,27 +1367,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { v_uint32x8 l, h; - v_expand(v_add_wrap(a - b, b - a), l, h); - return v_reduce_sum(l + h); + v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) { v_uint32x8 l, h; v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) { - return v_reduce_sum(v_max(a, b) - v_min(a, b)); + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 m = a < b; - return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); } inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) { - v_float32x8 a_b = a - b; + v_float32x8 a_b = v_sub(a, b); return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff))); } @@ -1503,9 +1491,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b*b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s) OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d) @@ -1556,20 +1544,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) { return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) { - v_int8x32 d = a - b; - v_int8x32 m = a < b; - return (d ^ m) - m; + v_int8x32 d = v_sub(a, b); + v_int8x32 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1891,7 +1879,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); } inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) @@ -1915,7 +1903,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) return v_uint32x8(__lasx_xvadd_w(prod0, prod1)); } inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline 
v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { @@ -1926,7 +1914,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) return v_int32x8(__lasx_xvadd_w(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) @@ -1938,7 +1926,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) return v_uint64x4(__lasx_xvadd_d(prod0, prod1)); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { @@ -1950,13 +1938,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1993,7 +1981,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0))); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) { @@ -2004,7 +1992,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) return v_int64x4(__lasx_xvadd_d(lo, hi)); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) @@ -2024,7 +2012,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp index 6e3290426f..aa997070c3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp @@ -525,53 +525,51 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2) /** Arithmetics **/ #define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, 
b.val); return a; } - -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d) - -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s) -OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s) -OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s) -OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d) -OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d) + +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d) // saturating multiply 8-bit, 16-bit -inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) +inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b) { v_uint16x8 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b) 
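// [Editorial aside — illustrative sketch, not part of the patch.]
// The 8-bit v_mul wrappers here keep the old operator*'s saturating semantics: lanes are
// widened to 16 bit (v_mul_expand), multiplied, then packed back with saturation
// (v_pack). Hypothetical scalar model of one uint8 lane (helper name is ours):
#include <cstdint>
static inline uint8_t mul_sat_u8(uint8_t a, uint8_t b)
{
    uint16_t wide = uint16_t(a) * uint16_t(b);          // widen + multiply (v_mul_expand)
    return wide > 255 ? uint8_t(255) : uint8_t(wide);   // saturating narrow (v_pack)
}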
+inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b) { v_int16x8 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b) +inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b) { __m128i a0 = a.val, b0 = b.val; __m128i pev = __lsx_vmulwev_w_hu(a0, b0); @@ -580,7 +578,7 @@ inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b) __m128i ph = __lsx_vilvh_w(pod, pev); return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0); } -inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b) +inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b) { __m128i a0 = a.val, b0 = b.val; __m128i pev = __lsx_vmulwev_w_h(a0, b0); @@ -589,14 +587,6 @@ inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b) __m128i ph = __lsx_vilvh_w(pod, pev); return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0); } -inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) -{ a = a * b; return a; } -inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b) -{ a = a * b; return a; } -inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b) -{ a = a * b; return a; } -inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b) -{ a = a * b; return a; } /** Non-saturating arithmetics **/ @@ -681,13 +671,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -708,10 +698,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \ - OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \ - OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \ - inline _Tpvec operator ~(const _Tpvec& a) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v) @@ -724,18 +714,14 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v) OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v) #define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \ - a.val = cast(c); \ - return a;} + inline _Tpvec 
bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } #define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \ - OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \ - OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \ - OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \ OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps) @@ -760,23 +746,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const /** Comparison **/ #define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~( a == b ); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a ; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } \ #define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ - inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \ - inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \ OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec) @@ -786,37 +772,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu) OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu) #define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d) OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d) #define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \ #define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \ - 
OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \ - OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \ OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s) OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d) -inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b) +inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b) { return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); } -inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b) +inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b) { return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); } -inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b) +inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b) { return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); } -inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b) +inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b) { return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); } inline v_float32x4 v_not_nan(const v_float32x4& a) @@ -1188,7 +1174,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) { - v_float32x4 a_b = a - b; + v_float32x4 a_b = v_sub(a, b); return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff))); } @@ -1295,9 +1281,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b * b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s) OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d) @@ -1349,20 +1335,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ///////// Conversions ///////// @@ -1673,7 +1659,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c ;} +{ return v_add(v_dotprod_expand(a, b), c) ;} inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -1685,7 +1671,7 
@@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) return v_int32x4(__lsx_vadd_w(prod0, prod1)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -1698,7 +1684,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(__lsx_vadd_d(prod0, prod1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1710,13 +1696,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(__lsx_vadd_d(prod0, prod1)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } ///////// Fast Dot Product ////// @@ -1755,7 +1741,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1))); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { @@ -1767,7 +1753,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) return v_int64x2(__lsx_vadd_d(lo, hi)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 23d6ebd3d1..8d2c22b087 100644 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -345,53 +345,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_int16x8, msa_qsubq_s16) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64) -OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{a = a * b; return a; } +} OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) @@ -546,13 +539,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(msa_hadd_s64(prod, prod)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return 
v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -596,10 +589,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ } @@ -614,21 +607,16 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64) OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64) #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not(const v_float32x4& a) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); } @@ -659,21 +647,16 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64) OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64) #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not(const v_float64x2& a) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); } @@ -704,17 +687,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64) 
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64) #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) @@ -821,9 +804,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ // trade efficiency for convenience #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \ diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp index 5681ae211d..4900418df3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp @@ -373,70 +373,50 @@ inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val, num)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val, num); \ - return a; \ -} - -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8) 
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4) -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) -{ - return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); } -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) + +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4) +inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b) { - a.val = vfdiv_vv_f32m1(a.val, b.val, 4); - return a; + return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); } -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2) -inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2) +inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b) { return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2)); } -inline v_float64x2& operator /= 
(v_float64x2& a, const v_float64x2& b) -{ - a.val = vfdiv_vv_f64m1(a.val, b.val, 2); - return a; -} // TODO: exp, log, sin, cos #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \ @@ -562,10 +542,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ } #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \ - inline _Tpvec operator ~ (const _Tpvec & a) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \ + inline _Tpvec v_not(const _Tpvec & a) \ { \ return _Tpvec(vnot_v_##suffix(a.val, num)); \ } @@ -580,41 +560,31 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4) OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \ - return a; \ } -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1) -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1) -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not(const v_float32x4& a) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4))); } #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \ - return a; \ } -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1) -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1) -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not(const v_float64x2& a) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2))); } @@ -1174,32 +1144,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4) OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4) #define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \ -inline _Tpvec operator == (const _Tpvec& a, const 
_Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ @@ -1215,37 +1185,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_) OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_) //TODO: == -inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); 
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); @@ -1259,37 +1229,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a) } //TODO: == -inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); @@ -1331,13 +1301,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32) #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \ template inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\ @@ -2037,13 +2007,11 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ auto res = mul(a.val, b.val, num); \ return _Tpvec(cvt(res, 0, num)); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1) OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1) @@ -2845,7 +2813,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const 
v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); @@ -2854,7 +2822,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { v_float64x2 res = v_dotprod_expand_fast(a, b); - return res + c; } + return v_add(res, c); } #endif ////// FP16 support /////// #if __riscv_v == 7000 diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 68b5a67bbc..f4761c96b4 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -735,53 +735,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ - } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { \ - a.val = intrin(a.val, b.val); \ - return a; \ - } - -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) + } + +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32) 
+OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) @@ -845,7 +838,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) @@ -872,7 +865,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) #endif } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) @@ -886,7 +879,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -899,7 +892,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) return v_int32x4(_mm_add_epi32(p0, p1)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -911,14 +904,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); return v_uint64x2(_mm_add_epi64( _mm_unpacklo_epi64(c0.val, d0.val), _mm_unpackhi_epi64(c0.val, d0.val) )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ 
return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -931,7 +924,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) @@ -939,8 +932,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #if CV_SSE4_1 return v_cvt_f64(v_dotprod(a, b)); #else - v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); - v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b)); + v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b)); return v_float64x2(_mm_add_pd( _mm_unpacklo_pd(c.val, d.val), @@ -949,7 +942,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #endif } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -957,13 +950,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { return v_dotprod(a, b); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { return v_dotprod(a, b); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod_fast(a, b) + c; } +{ return v_add(v_dotprod_fast(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) @@ -977,7 +970,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { @@ -994,7 +987,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) #endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1006,34 +999,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; - return c0 + d0; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); + return v_add(c0, d0); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& 
a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ } @@ -1182,58 +1175,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) } #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ -inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ } \ -inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ } \ -inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, 
not_mask)); \ } \ -inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ } \ -inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ } \ -inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ @@ -1244,17 +1237,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) @@ -1262,17 +1255,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) #if CV_SSE4_1 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #else #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \ return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #endif OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) @@ -1311,17 +1304,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const 
v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1329,25 +1322,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1381,12 +1374,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpvec res = v_fma(a, a, b*b); \ + _Tpvec res = v_fma(a, a, v_mul(b, b)); \ return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - return v_fma(a, a, b*b); \ + return v_fma(a, a, v_mul(b, b)); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ @@ -1397,19 +1390,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32(( OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(srai(a.val, imm)); \ } \ @@ -1711,9 +1704,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) inline int v_reduce_sum(const v_int16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x8& a) -{ return v_reduce_sum(v_expand_low(a) + 
v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint64 v_reduce_sum(const v_uint64x2& a) { @@ -1770,13 +1763,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1805,15 +1798,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index e66563bede..fbe690461a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -513,48 +513,44 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* Element-wise binary and unary operations */ /** Arithmetics **/ #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(intrin(a.val, b.val)); } \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ +{ 
return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub) // saturating multiply #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) @@ -596,9 +592,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ -inline _Tpvec operator << (const _Tpvec& a, int imm) \ +inline _Tpvec v_shl(const _Tpvec& a, int imm) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int imm) \ +inline _Tpvec v_shr(const _Tpvec& a, int imm) \ { return _Tpvec(shr(a.val, splfunc(imm))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ @@ -617,10 +613,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(vec_not(a.val)); } OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16) @@ -650,17 +646,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c) /** Comparison **/ #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpeq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec 
v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmplt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpgt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmple(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpge(a.val, b.val)); } OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16) @@ -1060,7 +1056,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } // TODO: exp, log, sin, cos @@ -1089,12 +1085,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) -{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } +{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Absolute difference for signed integers **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) @@ -1442,7 +1438,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) return v_int64x2(vec_add(even, odd)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -1485,7 +1481,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vec_add(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1495,13 +1491,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1531,7 +1527,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); } inline v_int32x4
v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1544,10 +1540,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index 5d470d9419..3a8069ca91 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -849,53 +849,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) +} + +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, 
v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float64x2, wasm_f64x2_div) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{ a = a * b; return a; } +} OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) @@ -986,7 +979,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { @@ -1000,7 +993,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { - return v_dotprod(a, b) + c; + return v_add(v_dotprod(a, b), c); } // 8 >> 32 @@ -1010,13 +1003,13 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) v128_t a1 = wasm_u16x8_shr(a.val, 8); v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_u16x8_shr(b.val, 8); - return v_uint32x4(( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + - v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + return v_uint32x4((v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), + v_dotprod(v_int16x8(a1), v_int16x8(b1)))).val ); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -1024,13 +1017,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) v128_t a1 = wasm_i16x8_shr(a.val, 8); v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_i16x8_shr(b.val, 8); - return v_int32x4( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + return v_int32x4(v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), v_dotprod(v_int16x8(a1), v_int16x8(b1)) - ); + )); } inline 
v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -1039,13 +1032,13 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v128_t a1 = wasm_u32x4_shr(a.val, 16); v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_u32x4_shr(b.val, 16); - return v_uint64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_uint64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))).val - ); + )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1053,20 +1046,20 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) v128_t a1 = wasm_i32x4_shr(a.val, 16); v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_i32x4_shr(b.val, 16); - return v_int64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_int64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))) - ); + )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1109,10 +1102,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_and, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_or, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_xor, _Tpvec, wasm_v128_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(wasm_v128_not(a.val)); \ } @@ -1215,17 +1208,17 @@ OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ -inline _Tpvec operator <= (const 
_Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) @@ -1238,10 +1231,10 @@ OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_ne(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) @@ -1299,17 +1292,17 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1317,25 +1310,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1345,12 +1338,12 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return a * b + c; + return 
v_add(v_mul(a, b), c); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) @@ -1386,19 +1379,19 @@ OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4) OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \ } \ @@ -1694,7 +1687,7 @@ inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) { @@ -1703,19 +1696,19 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1744,15 +1737,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/features2d/src/fast.avx2.cpp b/modules/features2d/src/fast.avx2.cpp index 72e7d66924..3d408a03df 100644 --- a/modules/features2d/src/fast.avx2.cpp +++ b/modules/features2d/src/fast.avx2.cpp @@ -157,7 +157,7 @@ public: q0 = v_max(q0, v_min(a, v0_)); q1 = v_min(q1, v_max(b, v0_)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); curr[j + k] = (uchar)(v_reduce_max(q0) - 1); } } diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 229b7f3ca5..e490380f84 100644 --- a/modules/imgproc/src/resize.cpp +++ 
b/modules/imgproc/src/resize.cpp @@ -2618,8 +2618,8 @@ public: v_uint32 r0, r1, r2, r3; v_expand(vx_load(S0), r0, r1); v_expand(vx_load(S1), r2, r3); - r0 += r2; r1 += r3; - v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); + r0 = v_add(r0, r2); r1 = v_add(r1, r3); + v_rshr_pack_store<2>(D, v_add(r0, v_rotate_left<1>(r1, r0))); } #else v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp index 208ffc1231..b4aafeaea2 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -130,9 +130,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -188,11 +188,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_expand(el8_1, el4l_1, el4h_1); @@ -350,9 +350,9 @@ struct Integral_SIMD prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -364,7 +364,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_int32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -442,9 +442,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -501,11 +501,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -590,13 +590,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, 
v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -663,9 +663,9 @@ struct Integral_SIMD prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -678,7 +678,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_float32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -770,9 +770,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -843,11 +843,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -958,13 +958,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -1058,9 +1058,9 @@ struct Integral_SIMD prev_1.val = prev_2.val = el4hh.val; #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi;
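
For readers migrating their own universal-intrinsics code, a minimal sketch (not taken from this patch; the helper name, buffers, and constant are illustrative assumptions) of how a typical kernel moves from the removed operator syntax to the wrapper functions, which are usable on every backend, including the scalable RVV backend where the overloaded operators are not provided:

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Hypothetical helper: dst[i] = a[i] * k + b[i]
static void scale_add(const float* a, const float* b, float* dst, int n, float k)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int lanes = VTraits<v_float32>::vlanes();
    const v_float32 vk = vx_setall_f32(k);
    int i = 0;
    for (; i + lanes <= n; i += lanes)
    {
        v_float32 va = vx_load(a + i), vb = vx_load(b + i);
        // before: v_store(dst + i, va * vk + vb);
        v_store(dst + i, v_add(v_mul(va, vk), vb));  // wrapper-based form
    }
    vx_cleanup();
    // scalar tail
    for (; i < n; ++i)
        dst[i] = a[i] * k + b[i];
#else
    for (int i = 0; i < n; ++i)
        dst[i] = a[i] * k + b[i];
#endif
}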