Merge pull request #23929 from CNClareChen:4.x

* Optimize some functions with LASX.

Optimize some functions with LASX. #23929

This patch optimizes several LASX functions and reduces the runtime of opencv_test_core from 662,238 ms to 633,603 ms on the 3A5000 platform.
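For illustration, a minimal standalone sketch (not part of the patch) of the recurring pattern in this change set: multi-instruction emulations in intrin_lasx.hpp are replaced with single native LASX instructions, e.g. the saturating 16-bit to 8-bit pack becomes one `__lasx_xvssrarni_b_h` instead of explicit clamping plus a byte shuffle. The `pack_s16_to_s8` helper name is invented for the example; building it requires a LoongArch64 compiler with `-mlasx`.

```cpp
// Sketch only: the single-instruction saturating pack used by the patch.
#include <lasxintrin.h>
#include <stdio.h>

// Pack two int16x16 vectors into one int8x32 vector: a saturating
// shift-right-and-narrow with shift amount 0 clamps each 16-bit element
// to [-128, 127] and narrows it to 8 bits.
static inline __m256i pack_s16_to_s8(__m256i a, __m256i b)
{
    return __lasx_xvssrarni_b_h(b, a, 0);
}

int main()
{
    short src[16];
    for (int i = 0; i < 16; i++)
        src[i] = (short)(i * 40 - 300);     // some values fall outside [-128, 127]

    __m256i v = __lasx_xvld(src, 0);        // load 16 x int16
    __m256i packed = pack_s16_to_s8(v, v);  // saturate and narrow

    signed char out[32];
    __lasx_xvst(packed, out, 0);
    for (int i = 0; i < 32; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}
```

The before/after hunks in the intrin_lasx.hpp diff below follow the same shape for the packs, popcounts, absolute differences and reductions.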

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Authored by CNClareChen; committed by GitHub. Commit d142a796d8 (parent 996b6c37c7).
Changed files (lines changed):

  1. cmake/OpenCVCompilerOptimizations.cmake (11)
  2. cmake/checks/cpu_lsx.cpp (15)
  3. modules/core/include/opencv2/core/cv_cpu_dispatch.h (9)
  4. modules/core/include/opencv2/core/cv_cpu_helper.h (21)
  5. modules/core/include/opencv2/core/cvdef.h (6)
  6. modules/core/include/opencv2/core/hal/intrin.hpp (6)
  7. modules/core/include/opencv2/core/hal/intrin_lasx.hpp (605)
  8. modules/core/include/opencv2/core/hal/intrin_lsx.hpp (2536)
  9. modules/core/src/system.cpp (5)

@ -53,6 +53,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS LSX)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@ -397,10 +398,16 @@ elseif(RISCV)
set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp")
ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX")
ocv_update(CPU_LSX_FLAGS_ON "-mlsx")
ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX")
set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
else()
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
endif()

@ -0,0 +1,15 @@
#include <stdio.h>
#include <lsxintrin.h>
int test()
{
const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f};
v4f32 val = (v4f32)__lsx_vld((const float*)(src), 0);
return __lsx_vpickve2gr_w(__lsx_vftint_w_s(val), 3);
}
int main()
{
printf("%d\n", test());
return 0;
}

@ -172,6 +172,11 @@
# define CV_MSA 1
#endif
#ifdef CV_CPU_COMPILE_LSX
# include <lsxintrin.h>
# define CV_LSX 1
#endif
#ifdef CV_CPU_COMPILE_LASX
# include <lasxintrin.h>
# define CV_LASX 1
@ -376,6 +381,10 @@ struct VZeroUpperGuard {
# define CV_RVV 0
#endif
#ifndef CV_LSX
# define CV_LSX 0
#endif
#ifndef CV_LASX
# define CV_LASX 0
#endif

@ -525,6 +525,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 1
# define CV_CPU_HAS_SUPPORT_LSX 1
# define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 0
# define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX))
# define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
# define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
#else
# define CV_TRY_LSX 0
# define CV_CPU_FORCE_LSX 0
# define CV_CPU_HAS_SUPPORT_LSX 0
# define CV_CPU_CALL_LSX(fn, args)
# define CV_CPU_CALL_LSX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...) CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 1

@ -281,7 +281,8 @@ namespace cv {
#define CV_CPU_RVV 210
#define CV_CPU_LASX 230
#define CV_CPU_LSX 230
#define CV_CPU_LASX 231
// CPU features groups
#define CV_CPU_AVX512_SKX 256
@ -342,7 +343,8 @@ enum CpuFeatures {
CPU_RVV = 210,
CPU_LASX = 230,
CPU_LSX = 230,
CPU_LASX = 231,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512

@ -206,7 +206,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_RVV
#endif
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
@ -242,6 +242,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_rvv.hpp"
#endif
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_lsx.hpp"
#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1

@ -96,54 +96,22 @@ inline __m256d _v256_setall_pd(double f64)
inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
{
__m256i u8min = __lasx_xvreplgr2vr_h(0);
__m256i u8max = __lasx_xvreplgr2vr_h(255);
__m256i sat_a = __lasx_xvmax_h(a, u8min);
sat_a = __lasx_xvmin_h(sat_a, u8max);
__m256i sat_b = __lasx_xvmax_h(b, u8min);
sat_b = __lasx_xvmin_h(sat_b, u8max);
__m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
return __lasx_xvssrarni_bu_h(b, a, 0);
}
inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
{
__m256i s8min = __lasx_xvreplgr2vr_h(-128);
__m256i s8max = __lasx_xvreplgr2vr_h(127);
__m256i sat_a = __lasx_xvmax_h(a, s8min);
sat_a = __lasx_xvmin_h(sat_a, s8max);
__m256i sat_b = __lasx_xvmax_h(b, s8min);
sat_b = __lasx_xvmin_h(sat_b, s8max);
__m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
return __lasx_xvssrarni_b_h(b, a, 0);
}
inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
{
__m256i u16min = __lasx_xvreplgr2vr_w(0);
__m256i u16max = __lasx_xvreplgr2vr_w(0xffff);
__m256i sat_a = __lasx_xvmax_w(a, u16min);
sat_a = __lasx_xvmin_w(sat_a, u16max);
__m256i sat_b = __lasx_xvmax_w(b, u16min);
sat_b = __lasx_xvmin_w(sat_b, u16max);
__m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14);
return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
return __lasx_xvssrarni_hu_w(b, a, 0);
}
inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
{
__m256i s16min = __lasx_xvreplgr2vr_w(-0x8000);
__m256i s16max = __lasx_xvreplgr2vr_w(0x7fff);
__m256i sat_a = __lasx_xvmax_w(a, s16min);
sat_a = __lasx_xvmin_w(sat_a, s16max);
__m256i sat_b = __lasx_xvmax_w(b, s16min);
sat_b = __lasx_xvmin_w(sat_b, s16max);
__m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14);
return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
return __lasx_xvssrarni_h_w(b, a, 0);
}
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
@ -191,7 +159,7 @@ inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31);
{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
return *((__m128i*)&temp256i); }
inline __m128 _v256_extract_high(const __m256& v)
@ -211,10 +179,7 @@ inline __m128d _v256_extract_low(const __m256d& v)
inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
const __m256i maxv = __lasx_xvreplgr2vr_w(65535);
__m256i am = __lasx_xvmin_wu(a, maxv);
__m256i bm = __lasx_xvmin_wu(b, maxv);
return _lasx_packus_w(am, bm);
return __lasx_xvssrlrni_hu_w(b, a, 0);
}
template<int i>
@ -869,14 +834,11 @@ OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i ad = __lasx_xvsrai_h(a.val, 8);
__m256i bd = __lasx_xvsrai_h(b.val, 8);
__m256i p0 = __lasx_xvmul_h(a.val, b.val);
__m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8);
const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00);
return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01));
__m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
__m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
return v_uint8x32(__lasx_xvpackev_b(p1, p0));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
@ -963,14 +925,7 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w)
inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift)
{
__m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63);
__m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift);
return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift));
}
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
/** Bitwise logic **/
@ -979,7 +934,7 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx)
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); }
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1))
@ -1224,11 +1179,9 @@ inline v_int8x32 v_reverse(const v_int8x32 &a)
inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
static const __m256i perm = _v256_setr_b(
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1));
__m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
vec = __lasx_xvshuf4i_w(vec, 0x4E);
return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
}
inline v_int16x16 v_reverse(const v_int16x16 &a)
@ -1236,8 +1189,8 @@ inline v_int16x16 v_reverse(const v_int16x16 &a)
inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0);
return v_uint32x8(__lasx_xvperm_w(a.val, perm));
__m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
}
inline v_int32x8 v_reverse(const v_int32x8 &a)
@ -1266,17 +1219,19 @@ inline unsigned v_reduce_sum(const v_uint8x32& a)
__m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline int v_reduce_sum(const v_int8x32& a)
{
__m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
__m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
__m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
}
#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
@ -1344,7 +1299,8 @@ OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
inline int v_reduce_sum(const v_int32x8& a)
{
__m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]);
__m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
}
inline unsigned v_reduce_sum(const v_uint32x8& a)
@ -1367,13 +1323,13 @@ inline float v_reduce_sum(const v_float32x8& a)
inline uint64 v_reduce_sum(const v_uint64x4& a)
{
uint64 *pa = (uint64*)&a;
return pa[0] + pa[1] + pa[2] + pa[3];
__m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
int64 *pa = (int64*)&a;
return pa[0] + pa[1] + pa[2] + pa[3];
__m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
}
inline double v_reduce_sum(const v_float64x4& a)
{
@ -1406,7 +1362,8 @@ inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
__m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
@ -1414,7 +1371,8 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
__m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
@ -1445,36 +1403,13 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
/** Popcount **/
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
__m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F);
return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)),
__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask))));
}
{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff));
}
{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff));
}
{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a));
uint8_t *pa = (uint8_t*)&atemp;
uint64 v[4];
for (int i = 0; i < 4; ++i) {
v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7];
}
return v_uint64x4(v[0], v[1], v[2], v[3]);
}
{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
@ -1500,10 +1435,9 @@ OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
inline int v_signmask(const v_int8x32& a)
{
int mask = 0;
int8_t *pa = (int8_t*)&a;
for( int i = 0; i < 32; i++ )
mask |= (reinterpret_int(pa[i]) < 0) << i;
__m256i result = __lasx_xvmskltz_b(a.val);
int mask = __lasx_xvpickve2gr_w(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
return mask;
}
inline int v_signmask(const v_uint8x32& a)
@ -1516,10 +1450,9 @@ inline int v_signmask(const v_uint16x16& a)
inline int v_signmask(const v_int32x8& a)
{
int mask = 0;
int *pa = (int*)&a;
for( int i = 0; i < 8; i++ )
mask |= (pa[i] < 0) << i;
__m256i result = __lasx_xvmskltz_w(a.val);
int mask = __lasx_xvpickve2gr_w(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
return mask;
}
inline int v_signmask(const v_uint32x8& a)
@ -1527,10 +1460,9 @@ inline int v_signmask(const v_uint32x8& a)
inline int v_signmask(const v_int64x4& a)
{
int mask = 0;
int64 *pa = (int64*)&a;
for( int i = 0; i < 4; i++ )
mask |= (pa[i] < 0) << i;
__m256i result = __lasx_xvmskltz_d(a.val);
int mask = __lasx_xvpickve2gr_d(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
return mask;
}
inline int v_signmask(const v_uint64x4& a)
@ -1592,7 +1524,7 @@ OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
return a * b + c;
return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
}
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@ -1601,17 +1533,10 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
}
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
v_float32x8 t = v_float32x8(__lasx_xvfrsqrt_s(x.val));
t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half);
return t;
}
{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
return v_float64x4(1., 1., 1., 1.) / v_sqrt(x);
}
{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
/** Absolute values **/
#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \
@ -1629,28 +1554,18 @@ inline v_float64x4 v_abs(const v_float64x4& x)
/** Absolute difference **/
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(a - b, b - a); }
{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(a - b, b - a); }
{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub_wrap(a, b);
v_int8x32 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 d = a - b;
v_int32x8 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }
@ -1740,28 +1655,8 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
}
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
// constants encoded as floating-point
__m256i magic_i_lo = __lasx_xvreplgr2vr_d(0x4330000000000000);
__m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000);
__m256i magic_i_all = __lasx_xvreplgr2vr_d(0x4530000080100000);
__m256d magic_d_all = _lasx_256_castsi256_pd(magic_i_all);
// Blend the 32 lowest significant bits of v with magic_int_lo
__m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1);
__m256i v_lo = __lasx_xvbitsel_v(magic_i_lo, v.val, mask);
// Extract the 32 most significant bits of v
__m256i v_hi = __lasx_xvsrli_d(v.val, 32);
// Flip the msb of v_hi and blend with 0x45300000
v_hi = __lasx_xvxor_v(v_hi, magic_i_hi32);
// Compute in double precision
__m256d v_hi_dbl = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all);
// (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition
__m256d result = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo));
return v_float64x4(result);
}
{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
////////////// Lookup table access ////////////////////
@ -1967,11 +1862,9 @@ inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
__m256i vzero = __lasx_xvreplgr2vr_w(0);
__m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
_v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100));
__m256i t2 = __lasx_xvshuf_b(vzero, t1,
_v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100));
return v_int8x32(__lasx_xvperm_w(t2,
__m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
_v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
return v_int8x32(__lasx_xvperm_w(t1,
_v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
@ -1980,11 +1873,9 @@ inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
__m256i vzero = __lasx_xvreplgr2vr_w(0);
__m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
_v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100));
__m256i t2 = __lasx_xvshuf_b(vzero, t1,
_v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100));
return v_int16x16(__lasx_xvperm_w(t2,
__m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
_v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
return v_int16x16(__lasx_xvperm_w(t1,
_v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
@ -2018,24 +1909,21 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
__m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
__m256i odd = __lasx_xvmulwod_d_w(a.val, b.val);
return v_int64x4(__lasx_xvadd_d(even, odd));
return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }
{
__m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00);
__m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m);
__m256i odd_a = __lasx_xvsrli_h(a.val, 8);
__m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), even_m);
__m256i odd_b = __lasx_xvsrli_h(b.val, 8);
__m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
__m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
__m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
__m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
@ -2043,14 +1931,10 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, con
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
__m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8);
__m256i odd_a = __lasx_xvsrai_h(a.val, 8);
__m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8);
__m256i odd_b = __lasx_xvsrai_h(b.val, 8);
__m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
__m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
__m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
__m256i odd = __lasx_xvmulwod_h_b(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_w_h(even, even);
__m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
@ -2059,36 +1943,24 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = __lasx_xvmul_h(a.val, b.val);
__m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
__m256i mul0 = __lasx_xvilvl_h(mulhi, mullo);
__m256i mul1 = __lasx_xvilvh_h(mulhi, mullo);
__m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p13 = __lasx_xvsrli_d(mul0, 32);
__m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p57 = __lasx_xvsrli_d(mul1, 32);
__m256i p15_ = __lasx_xvadd_d(p02, p13);
__m256i p9d_ = __lasx_xvadd_d(p46, p57);
return v_uint64x4(__lasx_xvadd_d(
__lasx_xvilvl_d(p9d_, p15_),
__lasx_xvilvh_d(p9d_, p15_)));
__m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
__m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
__m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
__m256i sign = __lasx_xvsrai_w(prod, 31);
__m256i lo = __lasx_xvilvl_w(sign, prod);
__m256i hi = __lasx_xvilvh_w(sign, prod);
return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo)));
__m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_h(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_d_w(even, even);
__m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
return v_int64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }
@ -2126,20 +1998,11 @@ inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, c
// 16 >> 64
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = __lasx_xvmul_h(a.val, b.val);
__m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
__m256i mul0 = __lasx_xvilvl_h(mulhi, mullo);
__m256i mul1 = __lasx_xvilvh_h(mulhi, mullo);
__m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p13 = __lasx_xvsrli_d(mul0, 32);
__m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p57 = __lasx_xvsrli_d(mul1, 32);
__m256i p15_ = __lasx_xvadd_d(p02, p13);
__m256i p9d_ = __lasx_xvadd_d(p46, p57);
return v_uint64x4(__lasx_xvadd_d(p15_, p9d_));
__m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
__m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
@ -2261,12 +2124,7 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i t = __lasx_xvreplgr2vr_h(255);
__m256i a1 = __lasx_xvmin_hu(a.val, t);
__m256i b1 = __lasx_xvmin_hu(b.val, t);
return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1)));
}
{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{
@ -2276,13 +2134,8 @@ inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
{
const __m256i m = __lasx_xvreplgr2vr_h(255);
__m256i am = __lasx_xvmin_hu(a.val, m);
am = _v256_shuffle_odd_64(_lasx_packus_h(am, am));
v_store_low(ptr, v_uint8x32(am));
}
inline void v_pack_store(uchar *ptr, const v_uint16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
@ -2290,45 +2143,43 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
template<int n> inline
v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
__m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
return v_uint8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
__m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
__m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
return v_uint8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack((a + delta) >> n, (b + delta) >> n);
__m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
return v_int8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
// 32
@ -2346,67 +2197,51 @@ inline void v_pack_store(short* ptr, const v_int32x8& a)
inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
const __m256i m = __lasx_xvreplgr2vr_w(65535);
__m256i am = __lasx_xvmin_wu(a.val, m);
am = _v256_shuffle_odd_64(_lasx_packus_w(am, am));
v_store_low(ptr, v_uint16x16(am));
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_u_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
// 64
// Non-saturating pack
inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
__m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
__m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08);
__m256i ab = __lasx_xvilvl_d(b0, a0);
__m256i ab = __lasx_xvpickev_w(b.val, a.val);
return v_uint32x8(_v256_shuffle_odd_64(ab));
}
@ -2424,31 +2259,19 @@ inline void v_pack_store(int* ptr, const v_int64x4& b)
template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
}
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); }
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
}
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); }
// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
@ -2583,63 +2406,48 @@ template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
///////////////////// load deinterleave /////////////////////////////
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b)
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 32, 0);
const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
__m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint8x32(a0);
b = v_uint8x32(b0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i p0 = __lasx_xvpickev_b(t1, t0);
__m256i p1 = __lasx_xvpickod_b(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 16, 0);
const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
__m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
__m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint16x16(a0);
b = v_uint16x16(b0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i p0 = __lasx_xvpickev_h(t1, t0);
__m256i p1 = __lasx_xvpickod_h(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 8, 0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
//const int sh = 0+2*4+1*16+3*64;
__m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8);
__m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint32x8(a0);
b = v_uint32x8(b0);
__m256i p0 = __lasx_xvpickev_w(t1, t0);
__m256i p1 = __lasx_xvpickod_w(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 4, 0);
__m256i ab1 = __lasx_xvld(ptr, 32);
__m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
__m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
@ -2652,8 +2460,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 32, 0);
__m256i bgr2 = __lasx_xvld(ptr + 64, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2686,8 +2494,8 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b,
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 16, 0);
__m256i bgr2 = __lasx_xvld(ptr + 32, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2717,8 +2525,8 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16&
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 8, 0);
__m256i bgr2 = __lasx_xvld(ptr + 16, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2741,8 +2549,8 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 4, 0);
__m256i bgr2 = __lasx_xvld(ptr + 8, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
__m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
@ -2756,81 +2564,60 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
c = v_uint64x4(r0);
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 32, 0);
__m256i bgr2 = __lasx_xvld(ptr + 64, 0);
__m256i bgr3 = __lasx_xvld(ptr + 96, 0);
const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i t2 = __lasx_xvld(ptr, 64);
__m256i t3 = __lasx_xvld(ptr, 96);
__m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
__m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
__m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
__m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
__m256i ac_lo = __lasx_xvpickev_b(t1, t0);
__m256i bd_lo = __lasx_xvpickod_b(t1, t0);
__m256i ac_hi = __lasx_xvpickev_b(t3, t2);
__m256i bd_hi = __lasx_xvpickod_b(t3, t2);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
__m256i p23l = __lasx_xvilvl_w(p3, p2);
__m256i p23h = __lasx_xvilvh_w(p3, p2);
__m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
__m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
__m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
__m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
__m256i b0 = __lasx_xvilvl_w(plh, pll);
__m256i g0 = __lasx_xvilvh_w(plh, pll);
__m256i r0 = __lasx_xvilvl_w(phh, phl);
__m256i a0 = __lasx_xvilvh_w(phh, phl);
__m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
__m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
__m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
__m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
a = v_uint8x32(b0);
b = v_uint8x32(g0);
c = v_uint8x32(r0);
d = v_uint8x32(a0);
a.val = __lasx_xvperm_w(a_pre, sh);
b.val = __lasx_xvperm_w(b_pre, sh);
c.val = __lasx_xvperm_w(c_pre, sh);
d.val = __lasx_xvperm_w(d_pre, sh);
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 16, 0);
__m256i bgr2 = __lasx_xvld(ptr + 32, 0);
__m256i bgr3 = __lasx_xvld(ptr + 48, 0);
const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
__m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
__m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
__m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
__m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i t2 = __lasx_xvld(ptr, 64);
__m256i t3 = __lasx_xvld(ptr, 96);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
__m256i p23l = __lasx_xvilvl_w(p3, p2);
__m256i p23h = __lasx_xvilvh_w(p3, p2);
const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
__m256i ac_lo = __lasx_xvpickev_h(t1, t0);
__m256i bd_lo = __lasx_xvpickod_h(t1, t0);
__m256i ac_hi = __lasx_xvpickev_h(t3, t2);
__m256i bd_hi = __lasx_xvpickod_h(t3, t2);
__m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
__m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
__m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
__m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
__m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
__m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
__m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
__m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
__m256i b0 = __lasx_xvilvl_w(plh, pll);
__m256i g0 = __lasx_xvilvh_w(plh, pll);
__m256i r0 = __lasx_xvilvl_w(phh, phl);
__m256i a0 = __lasx_xvilvh_w(phh, phl);
a = v_uint16x16(b0);
b = v_uint16x16(g0);
c = v_uint16x16(r0);
d = v_uint16x16(a0);
a.val = __lasx_xvperm_w(a_pre, sh);
b.val = __lasx_xvperm_w(b_pre, sh);
c.val = __lasx_xvperm_w(c_pre, sh);
d.val = __lasx_xvperm_w(d_pre, sh);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
{
__m256i p0 = __lasx_xvld(ptr, 0);
__m256i p1 = __lasx_xvld(ptr + 8, 0);
__m256i p2 = __lasx_xvld(ptr + 16, 0);
__m256i p3 = __lasx_xvld(ptr + 24, 0);
__m256i p1 = __lasx_xvld(ptr, 32);
__m256i p2 = __lasx_xvld(ptr, 64);
__m256i p3 = __lasx_xvld(ptr, 96);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
@ -2856,9 +2643,9 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
{
__m256i bgra0 = __lasx_xvld(ptr, 0);
__m256i bgra1 = __lasx_xvld(ptr + 4, 0);
__m256i bgra2 = __lasx_xvld(ptr + 8, 0);
__m256i bgra3 = __lasx_xvld(ptr + 12, 0);
__m256i bgra1 = __lasx_xvld(ptr, 32);
__m256i bgra2 = __lasx_xvld(ptr, 64);
__m256i bgra3 = __lasx_xvld(ptr, 96);
__m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
__m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);

(Diff of modules/core/include/opencv2/core/hal/intrin_lsx.hpp suppressed because the file is too large.)

@ -427,6 +427,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_LSX] = "LSX";
g_hwFeatureNames[CPU_LASX] = "LASX";
}
@ -703,6 +704,10 @@ struct HWFeatures
have[CV_CPU_RVV] = true;
#endif
#if defined __loongarch_sx
have[CV_CPU_LSX] = true;
#endif
#if defined __loongarch_asx
have[CV_CPU_LASX] = true;
#endif
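
As a quick sanity check (hypothetical usage, not part of the patch), the `CV_CPU_LSX` flag introduced above can be queried at runtime alongside `CV_CPU_LASX` once OpenCV is built with these changes:

```cpp
// Hypothetical check: prints whether the running CPU reports LSX/LASX support.
#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    std::cout << "LSX:  " << cv::checkHardwareSupport(CV_CPU_LSX)  << std::endl;
    std::cout << "LASX: " << cv::checkHardwareSupport(CV_CPU_LASX) << std::endl;
    return 0;
}
```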
