diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 68867a25d0..d45f327beb 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -53,6 +53,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD) list(APPEND CPU_ALL_OPTIMIZATIONS MSA) list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) list(APPEND CPU_ALL_OPTIMIZATIONS RVV) +list(APPEND CPU_ALL_OPTIMIZATIONS LSX) list(APPEND CPU_ALL_OPTIMIZATIONS LASX) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -397,10 +398,16 @@ elseif(RISCV) set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") elseif(LOONGARCH64) + ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp") ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp") - ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX") + ocv_update(CPU_LSX_FLAGS_ON "-mlsx") ocv_update(CPU_LASX_FLAGS_ON "-mlasx") - set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}") + if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX") + set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}") + else() + set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}") + endif() endif() diff --git a/cmake/checks/cpu_lsx.cpp b/cmake/checks/cpu_lsx.cpp new file mode 100644 index 0000000000..86c2e5c7a6 --- /dev/null +++ b/cmake/checks/cpu_lsx.cpp @@ -0,0 +1,15 @@ +#include <stdio.h> +#include <lsxintrin.h> + +int test() +{ + const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f}; + v4f32 val = (v4f32)__lsx_vld((const float*)(src), 0); + return __lsx_vpickve2gr_w(__lsx_vftint_w_s(val), 3); +} + +int main() +{ + printf("%d\n", test()); + return 0; +} diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 3235b6317e..de7b84b82a 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -172,6 +172,11 @@ # define CV_MSA 1 #endif +#ifdef CV_CPU_COMPILE_LSX +# include <lsxintrin.h> +# define CV_LSX 1 +#endif + #ifdef CV_CPU_COMPILE_LASX # include <lasxintrin.h> # define CV_LASX 1 @@ -376,6 +381,10 @@ struct VZeroUpperGuard { # define CV_RVV 0 #endif +#ifndef CV_LSX +# define CV_LSX 0 +#endif + #ifndef CV_LASX # define CV_LASX 0 #endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 41fc9d50fa..daca592e65 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -525,6 +525,27 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)
CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX +# define CV_TRY_LSX 1 +# define CV_CPU_FORCE_LSX 1 +# define CV_CPU_HAS_SUPPORT_LSX 1 +# define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX +# define CV_TRY_LSX 1 +# define CV_CPU_FORCE_LSX 0 +# define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX)) +# define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args) +# define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args) +#else +# define CV_TRY_LSX 0 +# define CV_CPU_FORCE_LSX 0 +# define CV_CPU_HAS_SUPPORT_LSX 0 +# define CV_CPU_CALL_LSX(fn, args) +# define CV_CPU_CALL_LSX_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...) CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX # define CV_TRY_LASX 1 # define CV_CPU_FORCE_LASX 1 diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 9793c0ba68..99cd66f8cc 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -281,7 +281,8 @@ namespace cv { #define CV_CPU_RVV 210 -#define CV_CPU_LASX 230 +#define CV_CPU_LSX 230 +#define CV_CPU_LASX 231 // CPU features groups #define CV_CPU_AVX512_SKX 256 @@ -342,7 +343,8 @@ enum CpuFeatures { CPU_RVV = 210, - CPU_LASX = 230, + CPU_LSX = 230, + CPU_LASX = 231, CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512 diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 904b05e405..3968cba8f0 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -206,7 +206,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_RVV #endif -#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP) +#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP) #define CV__SIMD_FORWARD 128 #include "opencv2/core/hal/intrin_forward.hpp" #endif @@ -242,6 +242,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #include "opencv2/core/hal/intrin_rvv.hpp" #endif +#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP) + +#include "opencv2/core/hal/intrin_lsx.hpp" + #elif CV_LASX #if !defined(CV_FORCE_SIMD128_CPP) #define CV_FORCE_SIMD128_CPP 1 diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp index 37f2e3f81d..5214e80743 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -96,54 +96,22 @@ inline __m256d _v256_setall_pd(double f64) inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b) { - __m256i u8min = __lasx_xvreplgr2vr_h(0); - __m256i u8max = __lasx_xvreplgr2vr_h(255); - __m256i sat_a = __lasx_xvmax_h(a, u8min); - sat_a = __lasx_xvmin_h(sat_a, u8max); - __m256i sat_b = 
__lasx_xvmax_h(b, u8min); - sat_b = __lasx_xvmin_h(sat_b, u8max); - __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); + return __lasx_xvssrarni_bu_h(b, a, 0); } inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b) { - __m256i s8min = __lasx_xvreplgr2vr_h(-128); - __m256i s8max = __lasx_xvreplgr2vr_h(127); - __m256i sat_a = __lasx_xvmax_h(a, s8min); - sat_a = __lasx_xvmin_h(sat_a, s8max); - __m256i sat_b = __lasx_xvmax_h(b, s8min); - sat_b = __lasx_xvmin_h(sat_b, s8max); - __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); + return __lasx_xvssrarni_b_h(b, a, 0); } inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b) { - __m256i u16min = __lasx_xvreplgr2vr_w(0); - __m256i u16max = __lasx_xvreplgr2vr_w(0xffff); - __m256i sat_a = __lasx_xvmax_w(a, u16min); - sat_a = __lasx_xvmin_w(sat_a, u16max); - __m256i sat_b = __lasx_xvmax_w(b, u16min); - sat_b = __lasx_xvmin_w(sat_b, u16max); - __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, - 0, 2, 4, 6, 8, 10, 12, 14); - return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); + return __lasx_xvssrarni_hu_w(b, a, 0); } inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b) { - __m256i s16min = __lasx_xvreplgr2vr_w(-0x8000); - __m256i s16max = __lasx_xvreplgr2vr_w(0x7fff); - __m256i sat_a = __lasx_xvmax_w(a, s16min); - sat_a = __lasx_xvmin_w(sat_a, s16max); - __m256i sat_b = __lasx_xvmax_w(b, s16min); - sat_b = __lasx_xvmin_w(sat_b, s16max); - __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, - 0, 2, 4, 6, 8, 10, 12, 14); - return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); + return __lasx_xvssrarni_h_w(b, a, 0); } inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) @@ -191,7 +159,7 @@ inline _Tpvec v256_permute4x64(const _Tpvec& a) { return _Tpvec(_v256_permute4x64(a.val)); } inline __m128i _v256_extract_high(const __m256i& v) -{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31); +{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E); return *((__m128i*)&temp256i); } inline __m128 _v256_extract_high(const __m256& v) @@ -211,10 +179,7 @@ inline __m128d _v256_extract_low(const __m256d& v) inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) { - const __m256i maxv = __lasx_xvreplgr2vr_w(65535); - __m256i am = __lasx_xvmin_wu(a, maxv); - __m256i bm = __lasx_xvmin_wu(b, maxv); - return _lasx_packus_w(am, bm); + return __lasx_xvssrlrni_hu_w(b, a, 0); } template @@ -869,14 +834,11 @@ OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h) inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) { - __m256i ad = __lasx_xvsrai_h(a.val, 8); - __m256i bd = __lasx_xvsrai_h(b.val, 8); - __m256i p0 = __lasx_xvmul_h(a.val, b.val); - __m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8); - - const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00); - return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01)); + __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val); + __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val); + return v_uint8x32(__lasx_xvpackev_b(p1, p0)); } + inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) { return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); @@ -963,14 +925,7 @@ inline v_uint16x16 v_mul_hi(const 
v_uint16x16& a, const v_uint16x16& b) { return OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h) OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w) - -inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift) -{ - __m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63); - __m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift); - return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift)); -} -OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx) +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d) /** Bitwise logic **/ @@ -979,7 +934,7 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx) OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \ OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \ inline _Tpvec operator ~ (const _Tpvec& a) \ - { return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); } + { return _Tpvec(__lasx_xvnori_b(a.val, 0)); } OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1)) OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1)) @@ -1224,11 +1179,9 @@ inline v_int8x32 v_reverse(const v_int8x32 &a) inline v_uint16x16 v_reverse(const v_uint16x16 &a) { - static const __m256i perm = _v256_setr_b( - 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, - 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); - return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1)); + __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B); + vec = __lasx_xvshuf4i_w(vec, 0x4E); + return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E)); } inline v_int16x16 v_reverse(const v_int16x16 &a) @@ -1236,8 +1189,8 @@ inline v_int16x16 v_reverse(const v_int16x16 &a) inline v_uint32x8 v_reverse(const v_uint32x8 &a) { - static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0); - return v_uint32x8(__lasx_xvperm_w(a.val, perm)); + __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B); + return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E)); } inline v_int32x8 v_reverse(const v_int32x8 &a) @@ -1266,17 +1219,19 @@ inline unsigned v_reduce_sum(const v_uint8x32& a) __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val); __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); - return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); } + inline int v_reduce_sum(const v_int8x32& a) { __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val); __m256i t2 = __lasx_xvhaddw_w_h(t1, t1); __m256i t3 = __lasx_xvhaddw_d_w(t2, t2); - return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]); + __m256i t4 = __lasx_xvhaddw_q_d(t3, t3); + return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]); } - #define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \ inline sctype v_reduce_##func(const _Tpvec& a) \ { \ @@ -1344,7 +1299,8 @@ OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s) inline int v_reduce_sum(const v_int32x8& a) { __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val); - return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]); + __m256i t2 = __lasx_xvhaddw_q_d(t1, t1); + return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]); } inline unsigned v_reduce_sum(const v_uint32x8& a) @@ -1367,13 +1323,13 @@ inline float v_reduce_sum(const v_float32x8& a) inline uint64 v_reduce_sum(const v_uint64x4& a) { - uint64 *pa = (uint64*)&a; - return pa[0] + pa[1] + pa[2] + pa[3]; + __m256i t0 = 
__lasx_xvhaddw_qu_du(a.val, a.val); + return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]); } inline int64 v_reduce_sum(const v_int64x4& a) { - int64 *pa = (int64*)&a; - return pa[0] + pa[1] + pa[2] + pa[3]; + __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val); + return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]); } inline double v_reduce_sum(const v_float64x4& a) { @@ -1406,7 +1362,8 @@ inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); - return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); } inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) { @@ -1414,7 +1371,8 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); - return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); } inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { @@ -1445,36 +1403,13 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) /** Popcount **/ inline v_uint8x32 v_popcount(const v_uint8x32& a) -{ - __m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); - __m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F); - return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)), - __lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask)))); -} +{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); } inline v_uint16x16 v_popcount(const v_uint16x16& a) -{ - v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff)); -} +{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); } inline v_uint32x8 v_popcount(const v_uint32x8& a) -{ - v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff)); -} +{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); } inline v_uint64x4 v_popcount(const v_uint64x4& a) -{ - v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a)); - uint8_t *pa = (uint8_t*)&atemp; - uint64 v[4]; - for (int i = 0; i < 4; ++i) { - v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7]; - } - return v_uint64x4(v[0], v[1], v[2], v[3]); -} +{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); } inline v_uint8x32 v_popcount(const v_int8x32& a) { return v_popcount(v_reinterpret_as_u8(a)); } inline v_uint16x16 v_popcount(const v_int16x16& a) @@ -1500,10 +1435,9 @@ OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) inline int v_signmask(const v_int8x32& a) { - int mask = 0; - int8_t *pa = (int8_t*)&a; - for( int i = 0; i < 32; i++ ) - mask |= (reinterpret_int(pa[i]) < 0) << i; + __m256i result = __lasx_xvmskltz_b(a.val); + int mask = __lasx_xvpickve2gr_w(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 16); return mask; } inline int v_signmask(const v_uint8x32& a) @@ -1516,10 +1450,9 @@ inline int v_signmask(const v_uint16x16& a) inline 
int v_signmask(const v_int32x8& a) { - int mask = 0; - int *pa = (int*)&a; - for( int i = 0; i < 8; i++ ) - mask |= (pa[i] < 0) << i; + __m256i result = __lasx_xvmskltz_w(a.val); + int mask = __lasx_xvpickve2gr_w(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 4); return mask; } inline int v_signmask(const v_uint32x8& a) @@ -1527,10 +1460,9 @@ inline int v_signmask(const v_uint32x8& a) inline int v_signmask(const v_int64x4& a) { - int mask = 0; - int64 *pa = (int64*)&a; - for( int i = 0; i < 4; i++ ) - mask |= (pa[i] < 0) << i; + __m256i result = __lasx_xvmskltz_d(a.val); + int mask = __lasx_xvpickve2gr_d(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 2); return mask; } inline int v_signmask(const v_uint64x4& a) @@ -1592,7 +1524,7 @@ OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d) inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) { - return a * b + c; + return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val)); } inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) @@ -1601,17 +1533,10 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x } inline v_float32x8 v_invsqrt(const v_float32x8& x) -{ - v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5); - v_float32x8 t = v_float32x8(__lasx_xvfrsqrt_s(x.val)); - t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half); - return t; -} +{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); } inline v_float64x4 v_invsqrt(const v_float64x4& x) -{ - return v_float64x4(1., 1., 1., 1.) / v_sqrt(x); -} +{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); } /** Absolute values **/ #define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \ @@ -1629,28 +1554,18 @@ inline v_float64x4 v_abs(const v_float64x4& x) /** Absolute difference **/ inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); } inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); } inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); } inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) -{ - v_int8x32 d = v_sub_wrap(a, b); - v_int8x32 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); -} - +{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); } inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) -{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } - +{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); } inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) -{ - v_int32x8 d = a - b; - v_int32x8 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); -} +{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) { return v_abs(a - b); } @@ -1740,28 +1655,8 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh)); } -// from (Mysticial and wim) https://stackoverflow.com/q/41144668 inline v_float64x4 v_cvt_f64(const v_int64x4& v) -{ - // constants encoded as floating-point - __m256i magic_i_lo = __lasx_xvreplgr2vr_d(0x4330000000000000); - __m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000); - __m256i magic_i_all = 
__lasx_xvreplgr2vr_d(0x4530000080100000); - __m256d magic_d_all = _lasx_256_castsi256_pd(magic_i_all); - - // Blend the 32 lowest significant bits of v with magic_int_lo - __m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1); - __m256i v_lo = __lasx_xvbitsel_v(magic_i_lo, v.val, mask); - // Extract the 32 most significant bits of v - __m256i v_hi = __lasx_xvsrli_d(v.val, 32); - // Flip the msb of v_hi and blend with 0x45300000 - v_hi = __lasx_xvxor_v(v_hi, magic_i_hi32); - // Compute in double precision - __m256d v_hi_dbl = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all); - // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition - __m256d result = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo)); - return v_float64x4(result); -} +{ return v_float64x4(__lasx_xvffint_d_l(v.val)); } ////////////// Lookup table access //////////////////// @@ -1967,11 +1862,9 @@ inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) inline v_int8x32 v_pack_triplets(const v_int8x32& vec) { __m256i vzero = __lasx_xvreplgr2vr_w(0); - __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, - _v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100)); - __m256i t2 = __lasx_xvshuf_b(vzero, t1, - _v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100)); - return v_int8x32(__lasx_xvperm_w(t2, + __m256i t1 = __lasx_xvshuf_b(vzero, vec.val, + _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100)); + return v_int8x32(__lasx_xvperm_w(t1, _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); } inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) @@ -1980,11 +1873,9 @@ inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) inline v_int16x16 v_pack_triplets(const v_int16x16& vec) { __m256i vzero = __lasx_xvreplgr2vr_w(0); - __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, - _v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100)); - __m256i t2 = __lasx_xvshuf_b(vzero, t1, - _v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100)); - return v_int16x16(__lasx_xvperm_w(t2, + __m256i t1 = __lasx_xvshuf_b(vzero, vec.val, + _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100)); + return v_int16x16(__lasx_xvperm_w(t1, _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); } inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) @@ -2018,24 +1909,21 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) { __m256i even = __lasx_xvmulwev_d_w(a.val, b.val); - __m256i odd = __lasx_xvmulwod_d_w(a.val, b.val); - return v_int64x4(__lasx_xvadd_d(even, odd)); + return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val)); } inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) -{ return v_dotprod(a, b) + c; } +{ + __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val); + return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val)); +} // 8 >> 32 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) { - __m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00); - __m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m); - __m256i odd_a = __lasx_xvsrli_h(a.val, 8); - - __m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), 
even_m); - __m256i odd_b = __lasx_xvsrli_h(b.val, 8); - - __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); - __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even); + __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd); return v_uint32x8(__lasx_xvadd_w(prod0, prod1)); } inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) @@ -2043,14 +1931,10 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, con inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { - __m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8); - __m256i odd_a = __lasx_xvsrai_h(a.val, 8); - - __m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8); - __m256i odd_b = __lasx_xvsrai_h(b.val, 8); - - __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); - __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + __m256i even = __lasx_xvmulwev_h_b(a.val, b.val); + __m256i odd = __lasx_xvmulwod_h_b(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_w_h(even, even); + __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd); return v_int32x8(__lasx_xvadd_w(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) @@ -2059,36 +1943,24 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) { - __m256i mullo = __lasx_xvmul_h(a.val, b.val); - __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); - __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); - __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); - - __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); - __m256i p13 = __lasx_xvsrli_d(mul0, 32); - __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); - __m256i p57 = __lasx_xvsrli_d(mul1, 32); - - __m256i p15_ = __lasx_xvadd_d(p02, p13); - __m256i p9d_ = __lasx_xvadd_d(p46, p57); - - return v_uint64x4(__lasx_xvadd_d( - __lasx_xvilvl_d(p9d_, p15_), - __lasx_xvilvh_d(p9d_, p15_))); + __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_du_wu(even, even); + __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd); + return v_uint64x4(__lasx_xvadd_d(prod0, prod1)); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) { return v_dotprod_expand(a, b) + c; } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { - __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); - __m256i sign = __lasx_xvsrai_w(prod, 31); - - __m256i lo = __lasx_xvilvl_w(sign, prod); - __m256i hi = __lasx_xvilvh_w(sign, prod); - - return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo))); + __m256i even = __lasx_xvmulwev_w_h(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_h(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_d_w(even, even); + __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd); + return v_int64x4(__lasx_xvadd_d(prod0, prod1)); } + inline v_int64x4 
v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) { return v_dotprod_expand(a, b) + c; } @@ -2126,20 +1998,11 @@ inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, c // 16 >> 64 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) { - __m256i mullo = __lasx_xvmul_h(a.val, b.val); - __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); - __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); - __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); - - __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); - __m256i p13 = __lasx_xvsrli_d(mul0, 32); - __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); - __m256i p57 = __lasx_xvsrli_d(mul1, 32); - - __m256i p15_ = __lasx_xvadd_d(p02, p13); - __m256i p9d_ = __lasx_xvadd_d(p46, p57); - - return v_uint64x4(__lasx_xvadd_d(p15_, p9d_)); + __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_du_wu(even, even); + __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd); + return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0))); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) { return v_dotprod_expand_fast(a, b) + c; } @@ -2261,12 +2124,7 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) { return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); } inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) -{ - __m256i t = __lasx_xvreplgr2vr_h(255); - __m256i a1 = __lasx_xvmin_hu(a.val, t); - __m256i b1 = __lasx_xvmin_hu(b.val, t); - return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1))); -} +{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); } inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) { @@ -2276,13 +2134,8 @@ inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } -inline void v_pack_store(uchar* ptr, const v_uint16x16& a) -{ - const __m256i m = __lasx_xvreplgr2vr_h(255); - __m256i am = __lasx_xvmin_hu(a.val, m); - am = _v256_shuffle_odd_64(_lasx_packus_h(am, am)); - v_store_low(ptr, v_uint8x32(am)); -} +inline void v_pack_store(uchar *ptr, const v_uint16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack_u(a, a)); } @@ -2290,45 +2143,43 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) template inline v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) { - // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
- v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n); + return v_uint8x32(_v256_shuffle_odd_64(res)); } template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { - v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } template inline v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) { - v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n); + return v_uint8x32(_v256_shuffle_odd_64(res)); } template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { - v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } template inline v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) { - v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n); + return v_int8x32(_v256_shuffle_odd_64(res)); } template inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { - v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } // 32 @@ -2346,67 +2197,51 @@ inline void v_pack_store(short* ptr, const v_int32x8& a) inline void v_pack_store(ushort* ptr, const v_uint32x8& a) { - const __m256i m = __lasx_xvreplgr2vr_w(65535); - __m256i am = __lasx_xvmin_wu(a.val, m); - am = _v256_shuffle_odd_64(_lasx_packus_w(am, am)); - v_store_low(ptr, v_uint16x16(am)); + __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) { v_store_low(ptr, v_pack_u(a, a)); } - template inline v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) -{ - // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. 
- v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); -} +{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); } template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { - v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } template inline v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) -{ - v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); -} +{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { - v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } template inline v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) -{ - v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); -} +{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); } template inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { - v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n); + __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); } // 64 // Non-saturating pack inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b) { - __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08); - __m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08); - __m256i ab = __lasx_xvilvl_d(b0, a0); + __m256i ab = __lasx_xvpickev_w(b.val, a.val); return v_uint32x8(_v256_shuffle_odd_64(ab)); } @@ -2424,31 +2259,19 @@ inline void v_pack_store(int* ptr, const v_int64x4& b) template inline v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) -{ - v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); -} +{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); } template inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) -{ - v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); -} +{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); } template inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) -{ - v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); -} +{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); } template inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) -{ - v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); -} +{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); } // pack boolean inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) @@ -2583,63 +2406,48 @@ template inline v_float32x8 v_broadcast_element(const v_float32x8 &a) { return v_reinterpret_as_f32(v_broadcast_element(v_reinterpret_as_u32(a))); } - ///////////////////// load deinterleave ///////////////////////////// -inline void v_load_deinterleave( const uchar* ptr, 
v_uint8x32& a, v_uint8x32& b ) +inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b) { - __m256i ab0 = __lasx_xvld(ptr, 0); - __m256i ab1 = __lasx_xvld(ptr + 32, 0); - - const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh); - __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh); - __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); - __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); - __m256i a0 = __lasx_xvilvl_d(ph, pl); - __m256i b0 = __lasx_xvilvh_d(ph, pl); - a = v_uint8x32(a0); - b = v_uint8x32(b0); + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + + __m256i p0 = __lasx_xvpickev_b(t1, t0); + __m256i p1 = __lasx_xvpickod_b(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); } inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b ) { - __m256i ab0 = __lasx_xvld(ptr, 0); - __m256i ab1 = __lasx_xvld(ptr + 16, 0); - - const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh); - __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh); - __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); - __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); - __m256i a0 = __lasx_xvilvl_d(ph, pl); - __m256i b0 = __lasx_xvilvh_d(ph, pl); - a = v_uint16x16(a0); - b = v_uint16x16(b0); + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + + __m256i p0 = __lasx_xvpickev_h(t1, t0); + __m256i p1 = __lasx_xvpickod_h(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); } inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) { - __m256i ab0 = __lasx_xvld(ptr, 0); - __m256i ab1 = __lasx_xvld(ptr + 8, 0); + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); - //const int sh = 0+2*4+1*16+3*64; - __m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8); - __m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8); - __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); - __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); - __m256i a0 = __lasx_xvilvl_d(ph, pl); - __m256i b0 = __lasx_xvilvh_d(ph, pl); - a = v_uint32x8(a0); - b = v_uint32x8(b0); + __m256i p0 = __lasx_xvpickev_w(t1, t0); + __m256i p1 = __lasx_xvpickod_w(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); } inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) { __m256i ab0 = __lasx_xvld(ptr, 0); - __m256i ab1 = __lasx_xvld(ptr + 4, 0); + __m256i ab1 = __lasx_xvld(ptr, 32); __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02); __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13); @@ -2652,8 +2460,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c ) { __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i bgr1 = __lasx_xvld(ptr + 32, 0); - __m256i bgr2 = __lasx_xvld(ptr + 64, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); @@ -2686,8 +2494,8 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c ) { __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i 
bgr1 = __lasx_xvld(ptr + 16, 0); - __m256i bgr2 = __lasx_xvld(ptr + 32, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); @@ -2717,8 +2525,8 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c ) { __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i bgr1 = __lasx_xvld(ptr + 8, 0); - __m256i bgr2 = __lasx_xvld(ptr + 16, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); @@ -2741,8 +2549,8 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c ) { __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i bgr1 = __lasx_xvld(ptr + 4, 0); - __m256i bgr2 = __lasx_xvld(ptr + 8, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128 __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12); @@ -2756,81 +2564,60 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b c = v_uint64x4(r0); } -inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d ) +inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d) { - __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i bgr1 = __lasx_xvld(ptr + 32, 0); - __m256i bgr2 = __lasx_xvld(ptr + 64, 0); - __m256i bgr3 = __lasx_xvld(ptr + 96, 0); - const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + __m256i t2 = __lasx_xvld(ptr, 64); + __m256i t3 = __lasx_xvld(ptr, 96); - __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); - __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); - __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); - __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7); + __m256i ac_lo = __lasx_xvpickev_b(t1, t0); + __m256i bd_lo = __lasx_xvpickod_b(t1, t0); + __m256i ac_hi = __lasx_xvpickev_b(t3, t2); + __m256i bd_hi = __lasx_xvpickod_b(t3, t2); - __m256i p01l = __lasx_xvilvl_w(p1, p0); - __m256i p01h = __lasx_xvilvh_w(p1, p0); - __m256i p23l = __lasx_xvilvl_w(p3, p2); - __m256i p23h = __lasx_xvilvh_w(p3, p2); - - __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); - __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); - __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); - __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); - - __m256i b0 = __lasx_xvilvl_w(plh, pll); - __m256i g0 = __lasx_xvilvh_w(plh, pll); - __m256i r0 = __lasx_xvilvl_w(phh, phl); - __m256i a0 = __lasx_xvilvh_w(phh, phl); + __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo); + __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo); + __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo); + __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo); - a = v_uint8x32(b0); - b = v_uint8x32(g0); - c = v_uint8x32(r0); - d = v_uint8x32(a0); + a.val = __lasx_xvperm_w(a_pre, sh); + b.val = __lasx_xvperm_w(b_pre, sh); + c.val = __lasx_xvperm_w(c_pre, sh); + d.val = __lasx_xvperm_w(d_pre, sh); } 
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d ) +inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d) { - __m256i bgr0 = __lasx_xvld(ptr, 0); - __m256i bgr1 = __lasx_xvld(ptr + 16, 0); - __m256i bgr2 = __lasx_xvld(ptr + 32, 0); - __m256i bgr3 = __lasx_xvld(ptr + 48, 0); - const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); - __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); - __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); - __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + __m256i t2 = __lasx_xvld(ptr, 64); + __m256i t3 = __lasx_xvld(ptr, 96); - __m256i p01l = __lasx_xvilvl_w(p1, p0); - __m256i p01h = __lasx_xvilvh_w(p1, p0); - __m256i p23l = __lasx_xvilvl_w(p3, p2); - __m256i p23h = __lasx_xvilvh_w(p3, p2); + const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7); + __m256i ac_lo = __lasx_xvpickev_h(t1, t0); + __m256i bd_lo = __lasx_xvpickod_h(t1, t0); + __m256i ac_hi = __lasx_xvpickev_h(t3, t2); + __m256i bd_hi = __lasx_xvpickod_h(t3, t2); - __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); - __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); - __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); - __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo); + __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo); + __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo); + __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo); - __m256i b0 = __lasx_xvilvl_w(plh, pll); - __m256i g0 = __lasx_xvilvh_w(plh, pll); - __m256i r0 = __lasx_xvilvl_w(phh, phl); - __m256i a0 = __lasx_xvilvh_w(phh, phl); - - a = v_uint16x16(b0); - b = v_uint16x16(g0); - c = v_uint16x16(r0); - d = v_uint16x16(a0); + a.val = __lasx_xvperm_w(a_pre, sh); + b.val = __lasx_xvperm_w(b_pre, sh); + c.val = __lasx_xvperm_w(c_pre, sh); + d.val = __lasx_xvperm_w(d_pre, sh); } inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d ) { __m256i p0 = __lasx_xvld(ptr, 0); - __m256i p1 = __lasx_xvld(ptr + 8, 0); - __m256i p2 = __lasx_xvld(ptr + 16, 0); - __m256i p3 = __lasx_xvld(ptr + 24, 0); + __m256i p1 = __lasx_xvld(ptr, 32); + __m256i p2 = __lasx_xvld(ptr, 64); + __m256i p3 = __lasx_xvld(ptr, 96); __m256i p01l = __lasx_xvilvl_w(p1, p0); __m256i p01h = __lasx_xvilvh_w(p1, p0); @@ -2856,9 +2643,9 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d ) { __m256i bgra0 = __lasx_xvld(ptr, 0); - __m256i bgra1 = __lasx_xvld(ptr + 4, 0); - __m256i bgra2 = __lasx_xvld(ptr + 8, 0); - __m256i bgra3 = __lasx_xvld(ptr + 12, 0); + __m256i bgra1 = __lasx_xvld(ptr, 32); + __m256i bgra2 = __lasx_xvld(ptr, 64); + __m256i bgra3 = __lasx_xvld(ptr, 96); __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02); __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13); diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp new file mode 100644 index 0000000000..4e3d2319ad --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp @@ -0,0 +1,2536 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_LSX_HPP +#define OPENCV_HAL_INTRIN_LSX_HPP + +#include <lsxintrin.h> + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 +#define CV_SIMD128_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +/////////// Utils //////// + +inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, + char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15) +{ + return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15 }; +} + +inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, + char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15) +{ + return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, + short v6, short v7) +{ + return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3) +{ + return (__m128i)v4i32{ v0, v1, v2, v3 }; +} + +inline __m128i _v128_set_w(int v0, int v1, int v2, int v3) +{ + return (__m128i)v4i32{ v3, v2, v1, v0 }; +} + +inline __m128i _v128_setall_w(int v0) +{ + return __lsx_vreplgr2vr_w(v0); +} + +inline __m128i _v128_setr_d(int64 v0, int64 v1) +{ + return (__m128i)v2i64{ v0, v1 }; +} + +inline __m128i _v128_set_d(int64 v0, int64 v1) +{ + return (__m128i)v2i64{ v1, v0 }; +} + +inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3) +{ + return (__m128)v4f32{ v0, v1, v2, v3 }; +} + +inline __m128 _v128_setall_ps(float v0) +{ + return (__m128)v4f32{ v0, v0, v0, v0 }; +} + +inline __m128d _v128_setr_pd(double v0, double v1) +{ + return (__m128d)v2f64{ v0, v1 }; +} + +inline __m128d _v128_setall_pd(double v0) +{ + return (__m128d)v2f64{ v0, v0 }; +} + +inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_bu_h(b, a, 0); +} + +inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_b_h(b, a, 0); +} + +inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_hu_w(b, a, 0); +} + +/////// Types /////// + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16}; + + v_uint8x16() {} + explicit v_uint8x16(__m128i v): val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + uchar get0() const + { + return (uchar)__lsx_vpickve2gr_bu(val, 0); + } + + __m128i val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(__m128i v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + schar get0() const + { + return (schar)__lsx_vpickve2gr_b(val, 0); + } + + __m128i val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} +
explicit v_uint16x8(__m128i v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); + } + + ushort get0() const + { + return (ushort)__lsx_vpickve2gr_hu(val, 0); + } + + __m128i val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(__m128i v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); + } + + short get0() const + { + return (short)__lsx_vpickve2gr_h(val, 0); + } + + __m128i val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(__m128i v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + val = _v128_setr_w(v0, v1, v2, v3); + } + + unsigned get0() const + { + return (unsigned)__lsx_vpickve2gr_wu(val, 0); + } + + __m128i val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(__m128i v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + val = _v128_setr_w(v0, v1, v2, v3); + } + + int get0() const + { + return (int)__lsx_vpickve2gr_w(val, 0); + } + + __m128i val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4}; + + v_float32x4() {} + explicit v_float32x4(__m128 v) : val(v) {} + explicit v_float32x4(__m128i v) { val = *((__m128*)&v); } + v_float32x4(float v0, float v1, float v2, float v3) + { + val = _v128_setr_ps(v0, v1, v2, v3); + } + + float get0() const + { + union { int iv; float fv; } d; + d.iv = __lsx_vpickve2gr_w(val, 0); + return d.fv; + } + + int get0toint() const + { + __m128i result = __lsx_vftintrz_w_s(val); + return (int)__lsx_vpickve2gr_w(result, 0); + } + + __m128 val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2}; + + v_uint64x2() {} + explicit v_uint64x2(__m128i v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + val = _v128_setr_d(v0, v1); + } + + uint64 get0() const + { + return __lsx_vpickve2gr_du(val, 0); + } + + __m128i val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2}; + + v_int64x2() {} + explicit v_int64x2(__m128i v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + val = _v128_setr_d(v0, v1); + } + + uint64 get0() const + { + return __lsx_vpickve2gr_d(val, 0); + } + + __m128i val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2}; + + v_float64x2() {} + explicit v_float64x2(__m128d v) : val(v) {} + explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); } + v_float64x2(double v0, double v1) + { + val = _v128_setr_pd(v0, v1); + } + + double get0() const + { + union { int64 iv; double fv; } d; + d.iv = __lsx_vpickve2gr_d(val, 0); + return d.fv; + } + + int64 get0toint64() const + { + __m128i result = __lsx_vftintrz_l_d(val); + return (int64)__lsx_vpickve2gr_d(result, 0); + } + + __m128d val; +}; + +////////////// Load and store operations ///////// + +#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v_load(const _Tp* ptr) \ + { return _Tpvec(__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_low(const _Tp* ptr) \ + { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); } \ + inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vl = __lsx_vldrepl_d(ptr0, 
0); \ + __m128i vh = __lsx_vldrepl_d(ptr1, 0); \ + return _Tpvec(__lsx_vilvl_d(vh, vl)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\ + { \ + if ( mode == hal::STORE_UNALIGNED) \ + __lsx_vst(a.val, ptr, 0); \ + else if ( mode == hal::STORE_ALIGNED_NOCACHE) \ + __lsx_vst(a.val, ptr, 0); \ + else \ + __lsx_vst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d(a.val, ptr, 0, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d(a.val, ptr, 0, 1); } \ + +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16, uchar) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16, schar) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8, short) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4, int) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2, uint64) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \ + inline _Tpvec v_load(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_aligned(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_low(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); } \ + inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vl = __lsx_vldrepl_d(ptr0, 0); \ + __m128i vh = __lsx_vldrepl_d(ptr1, 0); \ + return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\ + { \ + if( mode == hal::STORE_UNALIGNED) \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE) \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + else \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); } \ + +OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128) +OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d) + +inline __m128i _lsx_128_castps_si128(const __m128& v) +{ return __m128i(v); } + +inline __m128i _lsx_128_castpd_si128(const __m128d& v) +{ return __m128i(v); } + +#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v_setzero_##suffix() \ + { return _Tpvec(__lsx_vldi(0)); } \ + inline _Tpvec v_setall_##suffix(_Tp v) \ + { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, 
OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128) \ + +OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16, uchar, u8, b, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int8x16, schar, s8, b, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8, ushort, u16, h, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int16x8, short, s16, h, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int32x4, int, s32, w, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2, uint64, u64, d, long int) +OPENCV_HAL_IMPL_LSX_INIT(v_int64x2, int64, s64, d, long int) + +inline __m128 _lsx_128_castsi128_ps(const __m128i &v) +{ return __m128(v); } + +inline __m128d _lsx_128_castsi128_pd(const __m128i &v) +{ return __m128d(v); } + +#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v_setzero_##suffix() \ + { return _Tpvec(__lsx_vldi(0)); } \ + inline _Tpvec v_setall_##suffix(_Tp v) \ + { return _Tpvec(_v128_setall_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, cast) \ + +OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float, f32, ps, _lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd) + +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) +{ return a; } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) +{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); } + +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) +{ return a; } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) +{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); } + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); } \ + inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); } \ + +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16, b) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16, b) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8, h) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8, h) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2, d) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2, d) +OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d) + +//ZIP +#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \ + inline _Tpvec 
v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val; \ + c = _Tpvec(__lsx_vilvl_d(b1, a1)); \ + d = _Tpvec(__lsx_vilvh_d(b1, a1)); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + ab0 = v128_unpacklo(a, b); \ + ab1 = v128_unpackhi(a, b); \ + } + +OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16) +OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8) +OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2) +OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2) +OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d) + +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d) + +// saturating multiply 8-bit, 16-bit +inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) +{ + v_uint16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b) +{ + v_int16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i pev = __lsx_vmulwev_w_hu(a0, b0); + __m128i pod = __lsx_vmulwod_w_hu(a0, b0); + __m128i pl = __lsx_vilvl_w(pod, pev); + __m128i ph = __lsx_vilvh_w(pod, pev); + return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0); +} +inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i 
pev = __lsx_vmulwev_w_h(a0, b0); + __m128i pod = __lsx_vmulwod_w_h(a0, b0); + __m128i pl = __lsx_vilvl_w(pod, pev); + __m128i ph = __lsx_vilvh_w(pod, pev); + return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0); +} +inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) +{ a = a * b; return a; } +inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b) +{ a = a * b; return a; } +inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b) +{ a = a * b; return a; } +inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b) +{ a = a * b; return a; } + +/** Non-saturating arithmetics **/ + +#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16, __lsx_vadd_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8, __lsx_vadd_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16, __lsx_vsub_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8, __lsx_vsub_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8, __lsx_vmul_h) + +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_bu(a0, b0); + __m128i p1 = __lsx_vmulwod_h_bu(a0, b0); + return v_uint8x16(__lsx_vpackev_b(p1, p0)); +} + +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_bu(a0, b0); + __m128i p1 = __lsx_vmulwod_h_bu(a0, b0); + c.val = __lsx_vilvl_h(p1, p0); + d.val = __lsx_vilvh_h(p1, p0); +} +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_b(a0, b0); + __m128i p1 = __lsx_vmulwod_h_b(a0, b0); + c.val = __lsx_vilvl_h(p1, p0); + d.val = __lsx_vilvh_h(p1, p0); +} +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_w_h(a0, b0); + __m128i p1 = __lsx_vmulwod_w_h(a0, b0); + c.val = __lsx_vilvl_w(p1, p0); + d.val = __lsx_vilvh_w(p1, p0); +} +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_w_hu(a0, b0); + __m128i p1 = __lsx_vmulwod_w_hu(a0, b0); + c.val = __lsx_vilvl_w(p1, p0); + d.val = __lsx_vilvh_w(p1, p0); +} +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_d_wu(a0, b0); + __m128i p1 = __lsx_vmulwod_d_wu(a0, b0); + c.val = __lsx_vilvl_d(p1, p0); + d.val = __lsx_vilvh_d(p1, p0); +} +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); } +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ return 
v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); } + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); } \ + +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h) +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w) +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d) + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \ + inline _Tpvec operator ~(const _Tpvec& a) \ + { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \ + +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v) + +#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \ + a.val = cast(c); \ + return a;} + +#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \ + +OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd) + +/** Select **/ +#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); } \ + +OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16) +OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16) +OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8) +OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8) +OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4) +OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4) + +inline v_float32x4 v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b) +{ return 
v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); } +inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); } + +/** Comparison **/ +#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~( a == b ); } \ + inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ + { return b > a ; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } \ + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \ + OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16, v_int8x16, b, bu) +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu) +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu) + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d) +OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d) + +#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \ + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \ + +OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s) +OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d) + +inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b) +{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); } + +inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b) +{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); } + +inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); } + +inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); } + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); } + +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); } + +/** min/max **/ +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16, __lsx_vmin_b) 
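+// Usage sketch for the comparison/selection primitives above (the helper name
+// is illustrative only and not part of the universal intrinsics API): the
+// comparison operators and v_not_nan produce all-ones / all-zeros lane masks,
+// so they can be combined with the bitwise operators and fed straight into
+// v_select, e.g.
+//
+//   inline v_float32x4 clamp_negative_and_nan_to_zero(const v_float32x4& a)
+//   {
+//       v_float32x4 zero = v_setzero_f32();
+//       return v_select((a > zero) & v_not_nan(a), a, zero);
+//   }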
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16, __lsx_vmax_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8, __lsx_vmin_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8, __lsx_vmax_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4, __lsx_vmin_w) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4, __lsx_vmax_w) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d) + +template 16)), + bool is_first = (imm == 0), + bool is_half = (imm == 8), + bool is_second = (imm == 16), + bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))> +class v_lsx_palignr_u8_class; + +template +class v_lsx_palignr_u8_class; + +template +class v_lsx_palignr_u8_class +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return a; + } +}; + +template +class v_lsx_palignr_u8_class +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return __lsx_vshuf4i_d(a, b, 0x9); + } +}; + +template +class v_lsx_palignr_u8_class +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return b; + } +}; + +template +class v_lsx_palignr_u8_class +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + enum { imm2 = (sizeof(__m128i) - imm) }; + return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2)); + } +}; + +template +inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b) +{ + CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8"); + return v_lsx_palignr_u8_class()(a, b); +} +/** Rotate **/ +#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast) \ + template \ + inline _Tpvec v_rotate_right(const _Tpvec &a) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2); \ + return _Tpvec(cast(ret)); \ + } \ + template \ + inline _Tpvec v_rotate_left(const _Tpvec &a) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2); \ + return _Tpvec(cast(ret)); \ + } \ + template \ + inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + return _Tpvec(cast(v_lsx_palignr_u8((__m128i)a.val, (__m128i)b.val))); \ + } \ + template \ + inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type))}; \ + return _Tpvec(cast(v_lsx_palignr_u8((__m128i)b.val, (__m128i)a.val))); \ + } + +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2, OPENCV_HAL_NOP) \ + +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, 
_lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd) + +/** Rverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B); + return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B)); +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B); + return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E)); +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); } + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); } + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////////// Reduce and mask //////////// + +/** Reduce **/ +// this function is return a[0]+a[1]+...+a[31] +inline unsigned v_reduce_sum(const v_uint8x16& a) +{ + __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return (unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline int v_reduce_sum(const v_int8x16 &a) +{ + __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val); + __m128i t2 = __lsx_vhaddw_w_h(t1, t1); + __m128i t3 = __lsx_vhaddw_d_w(t2, t2); + __m128i t4 = __lsx_vhaddw_q_d(t3, t3); + return (int)__lsx_vpickve2gr_w(t4, 0); +} + +#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + val = intrin(val, __lsx_vbsrl_v(val, 2)); \ + val = intrin(val, __lsx_vbsrl_v(val, 1)); \ + return (sctype)__lsx_vpickve2gr_b(val, 0); \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec &a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + val = intrin(val, __lsx_vbsrl_v(val, 2)); \ + return (sctype)__lsx_vpickve2gr_h(val, 0); \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec &a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + return (sctype)__lsx_vpickve2gr_w(val, 0); \ + } + 
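+// The v_reduce_min/v_reduce_max helpers above fold the vector in half with
+// __lsx_vbsrl_v (a whole-register byte shift right) and combine the two halves
+// with the element-wise min/max intrinsic, repeating until a single lane is
+// left: log2(16) = 4 folds for the 8-bit case, 3 folds for 16-bit lanes and
+// 2 folds for the 32-bit instances that follow.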
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, min, __lsx_vmin_w) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, max, __lsx_vmax_w) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x4 &a) \ + { \ + __m128 val = a.val; \ + val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8)); \ + val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4)); \ + float *fval = (float*)&val; \ + return fval[0]; \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s) + +inline int v_reduce_sum(const v_int32x4 &a) +{ + __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val); + __m128i t2 = __lsx_vhaddw_q_d(t1, t1); + return (int)__lsx_vpickve2gr_w(t2, 0); +} + +inline unsigned v_reduce_sum(const v_uint32x4 &a) +{ + __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (int)__lsx_vpickve2gr_w(t2, 0); +} + +inline int v_reduce_sum(const v_int16x8 &a) +{ + __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val); + __m128i t2 = __lsx_vhaddw_d_w(t1, t1); + __m128i t3 = __lsx_vhaddw_q_d(t2, t2); + return (int)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sum(const v_uint16x8 &a) +{ + __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (int)__lsx_vpickve2gr_w(t3, 0); +} + +inline float v_reduce_sum(const v_float32x4 &a) +{ + __m128i val = (__m128i)a.val; + val = __lsx_vbsrl_v(val, 8); + __m128 result = __lsx_vfadd_s(a.val, (__m128)val); + float *pa = (float*)&result; + return (float)(pa[0] + pa[1]); +} + +inline uint64 v_reduce_sum(const v_uint64x2 &a) +{ + __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val); + return (uint64)__lsx_vpickve2gr_du(t0, 0); +} + +inline int64 v_reduce_sum(const v_int64x2 &a) +{ + __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val); + return (int64)__lsx_vpickve2gr_d(t0, 0); +} + +inline double v_reduce_sum(const v_float64x2 &a) +{ + double *pa = (double*)&a; + return pa[0] + pa[1]; +} + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + __m128i a0 = (__m128i)a.val; + __m128i b0 = (__m128i)b.val; + __m128i c0 = (__m128i)c.val; + __m128i d0 = (__m128i)d.val; + __m128i ac_l = __lsx_vilvl_w(c0, a0); + __m128i ac_h = __lsx_vilvh_w(c0, a0); + __m128i bd_l = __lsx_vilvl_w(d0, b0); + __m128i bd_h = __lsx_vilvh_w(d0, b0); + __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h); + __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h); + return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac), + (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac))); +} + +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + __m128i t0 = __lsx_vabsd_b(a.val, b.val); + __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return (unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i t0 = __lsx_vabsd_bu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return (unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline unsigned 
v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i t0 = __lsx_vabsd_hu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (unsigned)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + __m128i t0 = __lsx_vabsd_h(a.val, b.val); + __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (unsigned)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i t0 = __lsx_vabsd_wu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_du_wu(t0, t0); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (unsigned)__lsx_vpickve2gr_w(t2, 0); +} + +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + __m128i t0 = __lsx_vabsd_w(a.val, b.val); + __m128i t1 = __lsx_vhaddw_du_wu(t0, t0); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (unsigned)__lsx_vpickve2gr_w(t2, 0); +} + +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 a_b = a - b; + return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff))); +} + +/** Popcount **/ +#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_popcount(const _Tp& a) \ +{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); } + +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_uint8x16, b); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_int8x16, b); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_uint16x8, h); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_int16x8, h); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_uint32x4, w); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_int32x4, w); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_uint64x2, d); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_int64x2, d); + +/** Mask **/ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union {ft l; tt i;} v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +inline int v_signmask(const v_int8x16& a) +{ + __m128i result = __lsx_vmskltz_b(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_uint8x16& a) +{ return v_signmask(v_reinterpret_as_s8(a)) ;} + +inline int v_signmask(const v_int16x8 &a) +{ + __m128i result = __lsx_vmskltz_h(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_uint16x8 &a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + __m128i result = __lsx_vmskltz_w(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +inline int v_signmask(const v_uint64x2& a) +{ + __m128i result = __lsx_vmskltz_d(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } + +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(*(v_int32x4*)(&a)); } + +inline int 
v_signmask(const v_float64x2& a) +{ return v_signmask(*(v_int64x2*)(&a)); } + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +/** Checks **/ +#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535) +OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535) +OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255); +OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255); +OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3) +OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3) +OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3) + +///////////// Other math ///////////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec &b, const _Tpvec& c) \ + { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b * b)); } + +OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s) +OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d) + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); } + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return v_fma(a, b, c); } + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + return v_float32x4(__lsx_vfrsqrt_s(x.val)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + return v_float64x2(__lsx_vfrsqrt_d(x.val)); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); } + +OPENCV_HAL_IMPL_LSX_ABS(int8x16, b) +OPENCV_HAL_IMPL_LSX_ABS(int16x8, h) +OPENCV_HAL_IMPL_LSX_ABS(int32x4, w) + +inline v_float32x4 v_abs(const v_float32x4& x) +{ return 
v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); } +inline v_float64x2 v_abs(const v_float64x2& x) +{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); } + +/** Absolute difference **/ + +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); } +inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); } +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); } +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); } + +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ return v_abs(a - b); } + +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return (d ^ m) - m; +} +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_max(a, b) - v_min(a, b); } + +///////// Conversions ///////// + +/** Rounding **/ +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(__lsx_vftint_w_s(a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); } + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); } + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); } + +inline v_int32x4 v_floor(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ return v_trunc(v_float64x2(__lsx_vfrintrm_d(a.val))); } + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); } + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ return v_trunc(v_float64x2(__lsx_vfrintrp_d(a.val))); } + +/** To float **/ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ return v_float32x4(__lsx_vffint_s_w(a.val)); } + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); } + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); } + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ return v_float64x2(__lsx_vffintl_d_w(a.val)); } + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ return v_float64x2(__lsx_vffinth_d_w(a.val)); } + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ return v_float64x2(__lsx_vfcvtl_d_s(a.val)); } + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ return v_float64x2(__lsx_vfcvth_d_s(a.val)); } + +inline v_float64x2 v_cvt_f64(const v_int64x2& v) +{ return v_float64x2(__lsx_vffint_d_l(v.val)); } + + +//////////////// Lookup table access //////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + return 
v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]], + tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], + tab[idx[14]], tab[idx[15]])); +} + +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), + *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]), + *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))); +} + +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +} + +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_h(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1]))); +} + +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + return v_int32x4(_v128_setr_w(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_int32x4 v_lut_pairs(const int *tab, const int* idx) +{ + return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(__lsx_vld(tab + idx[0], 0)); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int *idx) +{ + return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]])); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(__lsx_vld(tab + idx[0], 0)); +} + +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return 
v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(_v128_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1]))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0)); +} + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]])); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0)); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int *idx = (int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + const int *idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + __m128i xy2 = __lsx_vld(tab + idx[2], 0); + __m128i xy3 = __lsx_vld(tab + idx[3], 0); + __m128i xy01 = __lsx_vilvl_d(xy1, xy0); + __m128i xy23 = __lsx_vilvl_d(xy3, xy2); + __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01); + __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01); + x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02)); + y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + const int* idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0)); + y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0)); +} + +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + return v_int8x16(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08))); +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + return v_int8x16(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08))); +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + return v_int16x8(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908))); +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) +{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + return v_int16x8(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504))); +} +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) +{ return 
v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8)); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) +{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) +{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + __m128i zero = __lsx_vldi(0); + return v_int8x16(__lsx_vshuf_b(zero, vec.val, + _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100))); +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + __m128i zero = __lsx_vldi(0); + return v_int16x8(__lsx_vshuf_b(zero, vec.val, + _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100))); +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) +{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + +//////////// Matrix operations ///////// + +/////////// Dot Product ///////// + +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y)); +} +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + __m128i x = a.val, y = b.val, z = c.val; + __m128i t = __lsx_vmaddwev_w_h(z, x, y); + return v_int32x4(__lsx_vmaddwod_w_h(t, x, y)); +} + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + __m128i x = a.val, y = b.val; + return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + __m128i x = a.val, y = b.val, z = c.val; + __m128i t = __lsx_vmaddwev_d_w(z, x, y); + return v_int64x2(__lsx_vmaddwod_d_w(t, x, y)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_h_bu(x, y); + __m128i odd = __lsx_vmulwod_h_bu(x, y); + __m128i prod0 = __lsx_vhaddw_wu_hu(even, even); + __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd); + return v_uint32x4(__lsx_vadd_w(prod0, prod1)); +} + +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b) + c ;} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_h_b(x, y); + __m128i odd = __lsx_vmulwod_h_b(x, y); + __m128i prod0 = __lsx_vhaddw_w_h(even, even); + __m128i prod1 = __lsx_vhaddw_w_h(odd, odd); + return v_int32x4(__lsx_vadd_w(prod0, prod1)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_hu(x, y); + __m128i odd = __lsx_vmulwod_w_hu(x, y); + __m128i prod0 = __lsx_vhaddw_du_wu(even, even); + __m128i prod1 
= __lsx_vhaddw_du_wu(odd, odd); + return v_uint64x2(__lsx_vadd_d(prod0, prod1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_h(x, y); + __m128i odd = __lsx_vmulwod_w_h(x, y); + __m128i prod0 = __lsx_vhaddw_d_w(even, even); + __m128i prod1 = __lsx_vhaddw_d_w(odd, odd); + return v_int64x2(__lsx_vadd_d(prod0, prod1)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + + +///////// Fast Dot Product ////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_hu(x, y); + __m128i odd = __lsx_vmulwod_w_hu(x, y); + __m128i prod0 = __lsx_vhaddw_du_wu(even, even); + __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd); + return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1))); +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y); + __m128i sign = __lsx_vsrai_w(prod, 31); + __m128i lo = __lsx_vilvl_w(sign, prod); + __m128i hi = __lsx_vilvh_w(sign, prod); + return v_int64x2(__lsx_vadd_d(lo, hi)); +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) +{ + __m128i x = (__m128i)v.val; 
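+    // The body below forms the linear combination v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3:
+    // each __lsx_vshuf4i_w immediate (0x0, 0x55, 0xAA, 0xFF) broadcasts one lane
+    // of v across the register, the broadcast is multiplied with the matching
+    // argument m0..m3, and the partial products are summed with __lsx_vfadd_s.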
+ __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val); + __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val); + __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val); + __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val); + + return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3))); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) +{ + __m128i x = (__m128i)v.val; + __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val); + __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val); + __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val); + + return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2)); +} + +#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val)); \ + __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val)); \ + __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val)); \ + __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val)); \ + b0.val = cast_to(__lsx_vilvl_d(t1, t0)); \ + b1.val = cast_to(__lsx_vilvh_d(t1, t0)); \ + b2.val = cast_to(__lsx_vilvl_d(t3, t2)); \ + b3.val = cast_to(__lsx_vilvh_d(t3, t2)); \ + } + +OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) + +inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1, + const v_float32x4& a2, const v_float32x4& a3, + v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3) +{ + __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val; + __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val; + __m128i t0 = __lsx_vilvl_w(vec1, vec0); + __m128i t1 = __lsx_vilvl_w(vec3, vec2); + __m128i t2 = __lsx_vilvh_w(vec1, vec0); + __m128i t3 = __lsx_vilvh_w(vec3, vec2); + b0.val = __m128(__lsx_vilvl_d(t1, t0)); + b1.val = __m128(__lsx_vilvh_d(t1, t0)); + b2.val = __m128(__lsx_vilvl_d(t3, t2)); + b3.val = __m128(__lsx_vilvh_d(t3, t2)); +} + +////////////////// Value reordering //////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin_lo(a.val, 0); \ + b1.val = intrin_hi(a.val); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin_lo(a.val, 0)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin_hi(a.val)); } \ + inline _Tpwvec v_load_expand(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpwvec(intrin_lo(a, 0)); \ + } + +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar, __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16, v_int16x8, schar, __lsx_vsllwil_h_b, __lsx_vexth_h_b) +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort, __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8, v_int32x4, short, __lsx_vsllwil_w_h, __lsx_vexth_w_h) +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4, v_int64x2, int, __lsx_vsllwil_d_w, __lsx_vexth_d_w) + +#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, 
+    inline _Tpvec v_load_expand_q(const _Tp* ptr)                      \
+    {                                                                  \
+        __m128i a = __lsx_vld(ptr, 0);                                 \
+        __m128i b = intrin_lo(a, 0);                                   \
+        return _Tpvec(intrin_hi(b, 0));                                \
+    }
+
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4, schar, __lsx_vsllwil_h_b, __lsx_vsllwil_w_h)
+
+/* pack */
+// 16
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }
+
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
+
+// 32
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }
+
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
+
+// 64
+// Non-saturating pack
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }
+
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }
+
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
+    __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
+    return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
+    __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
+    __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
+    __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);
+
+    __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
+    __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
+    return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec)                       \
+    template<int s>                                               \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)     \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2)
+
+#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) \
+template<int i>                                               \
+inline _Twvec v_extract_n(const _Tpvec& a)                    \
+{ return (_Twvec)intrin(a.val, i); }
+
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar, __lsx_vpickve2gr_b)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16, schar, __lsx_vpickve2gr_b)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort, __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8, short, __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint, __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4, int, __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64, __lsx_vpickve2gr_d)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2, int64, __lsx_vpickve2gr_d)
+
+template<int i>
+inline float v_extract_n(const v_float32x4& v)
+{
+    union { uint iv; float fv; } d;
+    d.iv = __lsx_vpickve2gr_w(v.val, i);
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(const v_float64x2& v)
+{
+    union { uint64 iv; double dv; } d;
+    d.iv = __lsx_vpickve2gr_d(v.val, i);
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
+
+/////////////////// load deinterleave //////////////////////////////
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+
+    a.val = __lsx_vpickev_b(t1, t0);
+    b.val = __lsx_vpickod_b(t1, t0);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_h(t1, t0);
+    b.val = __lsx_vpickod_h(t1, t0);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_w(t1, t0);
+    b.val = __lsx_vpickod_w(t1, t0);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vilvl_d(t1, t0);
+    b.val = __lsx_vilvh_d(t1, t0);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
+    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
+    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
+    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);
+
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
+    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+
+    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
+    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
+    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+
+    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
+    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
+    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);
+
+    a.val = __lsx_vextrins_w(a0, t2, 0x31);
+    b0 = __lsx_vshuf4i_w(b0, 0x38);
+    c0 = __lsx_vshuf4i_w(c0, 0x8);
+    b.val = __lsx_vextrins_w(b0, t2, 0x32);
+    c.val = __lsx_vpermi_w(t2, c0, 0xC4);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+
+    a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
+    b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
+    c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    __m128i ac_lo = __lsx_vpickev_b(t1, t0);
+    __m128i bd_lo = __lsx_vpickod_b(t1, t0);
+    __m128i ac_hi = __lsx_vpickev_b(t3, t2);
+    __m128i bd_hi = __lsx_vpickod_b(t3, t2);
+
+    a.val = __lsx_vpickev_b(ac_hi, ac_lo);
+    c.val = __lsx_vpickod_b(ac_hi, ac_lo);
+    b.val = __lsx_vpickev_b(bd_hi, bd_lo);
+    d.val = __lsx_vpickod_b(bd_hi, bd_lo);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    __m128i ac_lo = __lsx_vpickev_h(t1, t0);
+    __m128i bd_lo = __lsx_vpickod_h(t1, t0);
+    __m128i ac_hi = __lsx_vpickev_h(t3, t2);
+    __m128i bd_hi = __lsx_vpickod_h(t3, t2);
+
+    a.val = __lsx_vpickev_h(ac_hi, ac_lo);
+    c.val = __lsx_vpickod_h(ac_hi, ac_lo);
+    b.val = __lsx_vpickev_h(bd_hi, bd_lo);
+    d.val = __lsx_vpickod_h(bd_hi, bd_lo);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    __m128i p0 = __lsx_vld(ptr, 0);
+    __m128i p1 = __lsx_vld(ptr, 16);
+    __m128i p2 = __lsx_vld(ptr, 32);
+    __m128i p3 = __lsx_vld(ptr, 48);
+
+    __m128i t0 = __lsx_vilvl_w(p1, p0);
+    __m128i t1 = __lsx_vilvl_w(p3, p2);
+    __m128i t2 = __lsx_vilvh_w(p1, p0);
+    __m128i t3 = __lsx_vilvh_w(p3, p2);
+    a.val = __lsx_vilvl_d(t1, t0);
+    b.val = __lsx_vilvh_d(t1, t0);
+    c.val = __lsx_vilvl_d(t3, t2);
+    d.val = __lsx_vilvh_d(t3, t2);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    a.val = __lsx_vilvl_d(t2, t0);
+    b.val = __lsx_vilvh_d(t2, t0);
+    c.val = __lsx_vilvl_d(t3, t1);
+    d.val = __lsx_vilvh_d(t3, t1);
+}
+
+////////////////////////// store interleave ////////////////////////////////
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_b(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_b(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_h(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_h(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_w(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_w(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_d(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
+    const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
+    const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v_c = c.val;
+    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val); //a0 b0 a1 b1
+    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val); //a2 b2 a3 b3
+    __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3
+
+    __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4); //a0 b0 b1 a1
+    __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2
+    __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3
+
+    dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
+    dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
+    __lsx_vst(dst0, ptr, 0); //a0 b0 c0 a1
+    __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2
+    __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
+    __m128i dst2 = __lsx_vilvh_d(c.val, b.val);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               const v_uint8x16& c, const v_uint8x16& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               const v_uint16x8& c, const v_uint16x8& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
+    __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
+    __m128i dst3 = __lsx_vilvh_d(d.val, c.val);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \
+                                _Tpvec0& c0, _Tpvec0& d0) \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1); \
+} \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
+} \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                               const _Tpvec0& c0, const _Tpvec0& d0, \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
+}
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
+
+//
+// FP16
+//
+
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+#if CV_FP16
+    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
+#else
+    float CV_DECL_ALIGNED(32) buf[4];
+    for (int i = 0; i < 4; i++)
+        buf[i] = (float)ptr[i];
+    return v_float32x4((__m128)__lsx_vld(buf, 0));
+#endif
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& a)
+{
+#if CV_FP16
+    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val);
+    __lsx_vstelm_d(res, ptr, 0, 0);
+#else
+    float CV_DECL_ALIGNED(32) buf[4];
+    v_store_aligned(buf, a);
+    for (int i = 0; i < 4; i++)
+        ptr[i] = float16_t(buf[i]);
+#endif
+}
+
+//
+// end of FP16
+//
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_LSX_HPP
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 8b9f47dc2d..8d651b8c8d 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -427,6 +427,7 @@ struct HWFeatures
         g_hwFeatureNames[CPU_RVV] = "RVV";
 
+        g_hwFeatureNames[CPU_LSX] = "LSX";
         g_hwFeatureNames[CPU_LASX] = "LASX";
     }
 
@@ -703,6 +704,10 @@ struct HWFeatures
         have[CV_CPU_RVV] = true;
     #endif
 
+    #if defined __loongarch_sx
+        have[CV_CPU_LSX] = true;
+    #endif
+
     #if defined __loongarch_asx
         have[CV_CPU_LASX] = true;
    #endif
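
Note (not part of the patch): a minimal, assumption-laden way to smoke-test the new backend outside the OpenCV test suite is sketched below. It only uses universal-intrinsic calls that every SIMD128 backend provides (v_load_deinterleave, v_reduce_sum), so the same source builds on x86, ARM, or LoongArch; on LoongArch64 compiled with -mlsx these calls resolve to the __lsx_* implementations added in intrin_lsx.hpp, elsewhere to another backend or the C++ fallback. The file name and the build command are illustrative only.

// check_lsx_universal.cpp -- hypothetical smoke test for the LSX universal-intrinsics backend.
// Illustrative build (flags/paths are assumptions): g++ -O2 -mlsx -I<opencv_include_dir> check_lsx_universal.cpp
#include <cstdio>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if CV_SIMD128
    // Four interleaved (x, y) int32 pairs, as a 2-channel image row would look in memory.
    int xy[8] = { 0, 10, 1, 11, 2, 12, 3, 13 };

    cv::v_int32x4 x, y;
    cv::v_load_deinterleave(xy, x, y);   // x = {0,1,2,3}, y = {10,11,12,13}

    // Horizontal sums exercise the per-backend reduction path.
    std::printf("sum(x)=%d sum(y)=%d\n", cv::v_reduce_sum(x), cv::v_reduce_sum(y));
#else
    std::printf("universal intrinsics are disabled in this build\n");
#endif
    return 0;
}

Running it on LoongArch hardware (or under QEMU with LSX enabled) and seeing sum(x)=6 sum(y)=46 confirms that the deinterleave and reduction paths of the new backend behave like the reference C++ implementation.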