Merge pull request #23929 from CNClareChen:4.x

* Optimize some functions with LASX.

Optimize some functions with LASX. #23929

This patch optimizes several LASX functions, reducing the runtime of opencv_test_core from 662,238 ms to 633,603 ms on the 3A5000 platform.
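For anyone verifying the new baselines on LoongArch hardware, a minimal runtime check along these lines should work (assuming a 4.x build that already contains this patch; `cv::checkHardwareSupport` and `cv::getCPUFeaturesLine` are the existing core APIs):

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Query the feature flags wired up by this patch (CV_CPU_LSX / CV_CPU_LASX).
    std::cout << "LSX support:  " << cv::checkHardwareSupport(CV_CPU_LSX)  << "\n";
    std::cout << "LASX support: " << cv::checkHardwareSupport(CV_CPU_LASX) << "\n";
    // Summary line of baseline and dispatched CPU features in this build.
    std::cout << "CPU features: " << cv::getCPUFeaturesLine() << std::endl;
    return 0;
}
```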

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Branch: pull/24438/head
Author: CNClareChen (committed via GitHub)
Commit: d142a796d8 (parent 996b6c37c7)
Changed files (lines changed):

- cmake/OpenCVCompilerOptimizations.cmake (9)
- cmake/checks/cpu_lsx.cpp (15)
- modules/core/include/opencv2/core/cv_cpu_dispatch.h (9)
- modules/core/include/opencv2/core/cv_cpu_helper.h (21)
- modules/core/include/opencv2/core/cvdef.h (6)
- modules/core/include/opencv2/core/hal/intrin.hpp (6)
- modules/core/include/opencv2/core/hal/intrin_lasx.hpp (597)
- modules/core/include/opencv2/core/hal/intrin_lsx.hpp (2536)
- modules/core/src/system.cpp (5)

@ -53,6 +53,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS LSX)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@ -397,10 +398,16 @@ elseif(RISCV)
set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp")
ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX")
ocv_update(CPU_LSX_FLAGS_ON "-mlsx")
ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX")
set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
else()
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
endif()

@ -0,0 +1,15 @@
#include <stdio.h>
#include <lsxintrin.h>
int test()
{
const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f};
v4f32 val = (v4f32)__lsx_vld((const float*)(src), 0);
return __lsx_vpickve2gr_w(__lsx_vftint_w_s(val), 3);
}
int main()
{
printf("%d\n", test());
return 0;
}

@ -172,6 +172,11 @@
# define CV_MSA 1
#endif
#ifdef CV_CPU_COMPILE_LSX
# include <lsxintrin.h>
# define CV_LSX 1
#endif
#ifdef CV_CPU_COMPILE_LASX
# include <lasxintrin.h>
# define CV_LASX 1
@ -376,6 +381,10 @@ struct VZeroUpperGuard {
# define CV_RVV 0
#endif
#ifndef CV_LSX
# define CV_LSX 0
#endif
#ifndef CV_LASX
# define CV_LASX 0
#endif

@ -525,6 +525,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 1
# define CV_CPU_HAS_SUPPORT_LSX 1
# define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 0
# define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX))
# define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
# define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
#else
# define CV_TRY_LSX 0
# define CV_CPU_FORCE_LSX 0
# define CV_CPU_HAS_SUPPORT_LSX 0
# define CV_CPU_CALL_LSX(fn, args)
# define CV_CPU_CALL_LSX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...) CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 1

@ -281,7 +281,8 @@ namespace cv {
#define CV_CPU_RVV 210
#define CV_CPU_LASX 230
#define CV_CPU_LSX 230
#define CV_CPU_LASX 231
// CPU features groups
#define CV_CPU_AVX512_SKX 256
@ -342,7 +343,8 @@ enum CpuFeatures {
CPU_RVV = 210,
CPU_LASX = 230,
CPU_LSX = 230,
CPU_LASX = 231,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512

@ -206,7 +206,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_RVV
#endif
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
@ -242,6 +242,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_rvv.hpp"
#endif
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_lsx.hpp"
#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1

@ -96,54 +96,22 @@ inline __m256d _v256_setall_pd(double f64)
inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
{
__m256i u8min = __lasx_xvreplgr2vr_h(0);
__m256i u8max = __lasx_xvreplgr2vr_h(255);
__m256i sat_a = __lasx_xvmax_h(a, u8min);
sat_a = __lasx_xvmin_h(sat_a, u8max);
__m256i sat_b = __lasx_xvmax_h(b, u8min);
sat_b = __lasx_xvmin_h(sat_b, u8max);
__m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
return __lasx_xvssrarni_bu_h(b, a, 0);
}
inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
{
__m256i s8min = __lasx_xvreplgr2vr_h(-128);
__m256i s8max = __lasx_xvreplgr2vr_h(127);
__m256i sat_a = __lasx_xvmax_h(a, s8min);
sat_a = __lasx_xvmin_h(sat_a, s8max);
__m256i sat_b = __lasx_xvmax_h(b, s8min);
sat_b = __lasx_xvmin_h(sat_b, s8max);
__m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
return __lasx_xvssrarni_b_h(b, a, 0);
}
inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
{
__m256i u16min = __lasx_xvreplgr2vr_w(0);
__m256i u16max = __lasx_xvreplgr2vr_w(0xffff);
__m256i sat_a = __lasx_xvmax_w(a, u16min);
sat_a = __lasx_xvmin_w(sat_a, u16max);
__m256i sat_b = __lasx_xvmax_w(b, u16min);
sat_b = __lasx_xvmin_w(sat_b, u16max);
__m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14);
return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
return __lasx_xvssrarni_hu_w(b, a, 0);
}
inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
{
__m256i s16min = __lasx_xvreplgr2vr_w(-0x8000);
__m256i s16max = __lasx_xvreplgr2vr_w(0x7fff);
__m256i sat_a = __lasx_xvmax_w(a, s16min);
sat_a = __lasx_xvmin_w(sat_a, s16max);
__m256i sat_b = __lasx_xvmax_w(b, s16min);
sat_b = __lasx_xvmin_w(sat_b, s16max);
__m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14);
return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
return __lasx_xvssrarni_h_w(b, a, 0);
}
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
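The four pack helpers above now map to a single saturating shift-right-and-narrow instruction instead of explicit min/max clamping plus a byte shuffle. A minimal scalar sketch of the per-lane saturation both versions implement (illustrative only, not part of the patch; 256-bit lane placement is handled by the surrounding code):

```cpp
// Scalar reference for the saturating narrowing done per lane.
#include <algorithm>
#include <cstdint>

static inline uint8_t pack_us_h(int16_t v)   // signed 16-bit -> unsigned 8-bit
{ return (uint8_t)std::min(std::max((int)v, 0), 255); }

static inline int8_t pack_s_h(int16_t v)     // signed 16-bit -> signed 8-bit
{ return (int8_t)std::min(std::max((int)v, -128), 127); }

static inline uint16_t pack_us_w(int32_t v)  // signed 32-bit -> unsigned 16-bit
{ return (uint16_t)std::min(std::max(v, 0), 65535); }
```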
@ -191,7 +159,7 @@ inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31);
{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
return *((__m128i*)&temp256i); }
inline __m128 _v256_extract_high(const __m256& v)
@ -211,10 +179,7 @@ inline __m128d _v256_extract_low(const __m256d& v)
inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
const __m256i maxv = __lasx_xvreplgr2vr_w(65535);
__m256i am = __lasx_xvmin_wu(a, maxv);
__m256i bm = __lasx_xvmin_wu(b, maxv);
return _lasx_packus_w(am, bm);
return __lasx_xvssrlrni_hu_w(b, a, 0);
}
template<int i>
@ -869,14 +834,11 @@ OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i ad = __lasx_xvsrai_h(a.val, 8);
__m256i bd = __lasx_xvsrai_h(b.val, 8);
__m256i p0 = __lasx_xvmul_h(a.val, b.val);
__m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8);
const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00);
return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01));
__m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
__m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
return v_uint8x32(__lasx_xvpackev_b(p1, p0));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
@ -963,14 +925,7 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w)
inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift)
{
__m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63);
__m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift);
return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift));
}
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
/** Bitwise logic **/
@ -979,7 +934,7 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx)
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); }
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1))
@ -1224,11 +1179,9 @@ inline v_int8x32 v_reverse(const v_int8x32 &a)
inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
static const __m256i perm = _v256_setr_b(
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1));
__m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
vec = __lasx_xvshuf4i_w(vec, 0x4E);
return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
}
inline v_int16x16 v_reverse(const v_int16x16 &a)
@ -1236,8 +1189,8 @@ inline v_int16x16 v_reverse(const v_int16x16 &a)
inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0);
return v_uint32x8(__lasx_xvperm_w(a.val, perm));
__m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
}
inline v_int32x8 v_reverse(const v_int32x8 &a)
@ -1266,17 +1219,19 @@ inline unsigned v_reduce_sum(const v_uint8x32& a)
__m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline int v_reduce_sum(const v_int8x32& a)
{
__m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
__m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
__m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
}
#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
@ -1344,7 +1299,8 @@ OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
inline int v_reduce_sum(const v_int32x8& a)
{
__m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]);
__m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
}
inline unsigned v_reduce_sum(const v_uint32x8& a)
@ -1367,13 +1323,13 @@ inline float v_reduce_sum(const v_float32x8& a)
inline uint64 v_reduce_sum(const v_uint64x4& a)
{
uint64 *pa = (uint64*)&a;
return pa[0] + pa[1] + pa[2] + pa[3];
__m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
int64 *pa = (int64*)&a;
return pa[0] + pa[1] + pa[2] + pa[3];
__m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
}
inline double v_reduce_sum(const v_float64x4& a)
{
@ -1406,7 +1362,8 @@ inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
__m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
@ -1414,7 +1371,8 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
__m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
__m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
__m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
__m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
@ -1445,36 +1403,13 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
/** Popcount **/
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
__m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F);
return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)),
__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask))));
}
{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff));
}
{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff));
}
{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a));
uint8_t *pa = (uint8_t*)&atemp;
uint64 v[4];
for (int i = 0; i < 4; ++i) {
v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7];
}
return v_uint64x4(v[0], v[1], v[2], v[3]);
}
{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
@ -1500,10 +1435,9 @@ OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
inline int v_signmask(const v_int8x32& a)
{
int mask = 0;
int8_t *pa = (int8_t*)&a;
for( int i = 0; i < 32; i++ )
mask |= (reinterpret_int(pa[i]) < 0) << i;
__m256i result = __lasx_xvmskltz_b(a.val);
int mask = __lasx_xvpickve2gr_w(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
return mask;
}
inline int v_signmask(const v_uint8x32& a)
@ -1516,10 +1450,9 @@ inline int v_signmask(const v_uint16x16& a)
inline int v_signmask(const v_int32x8& a)
{
int mask = 0;
int *pa = (int*)&a;
for( int i = 0; i < 8; i++ )
mask |= (pa[i] < 0) << i;
__m256i result = __lasx_xvmskltz_w(a.val);
int mask = __lasx_xvpickve2gr_w(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
return mask;
}
inline int v_signmask(const v_uint32x8& a)
@ -1527,10 +1460,9 @@ inline int v_signmask(const v_uint32x8& a)
inline int v_signmask(const v_int64x4& a)
{
int mask = 0;
int64 *pa = (int64*)&a;
for( int i = 0; i < 4; i++ )
mask |= (pa[i] < 0) << i;
__m256i result = __lasx_xvmskltz_d(a.val);
int mask = __lasx_xvpickve2gr_d(result, 0);
mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
return mask;
}
inline int v_signmask(const v_uint64x4& a)
@ -1592,7 +1524,7 @@ OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
return a * b + c;
return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
}
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@ -1601,17 +1533,10 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
}
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
v_float32x8 t = v_float32x8(__lasx_xvfrsqrt_s(x.val));
t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half);
return t;
}
{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
return v_float64x4(1., 1., 1., 1.) / v_sqrt(x);
}
{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
/** Absolute values **/
#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \
@ -1629,28 +1554,18 @@ inline v_float64x4 v_abs(const v_float64x4& x)
/** Absolute difference **/
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(a - b, b - a); }
{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(a - b, b - a); }
{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub_wrap(a, b);
v_int8x32 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 d = a - b;
v_int32x8 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }
@ -1740,28 +1655,8 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
}
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
// constants encoded as floating-point
__m256i magic_i_lo = __lasx_xvreplgr2vr_d(0x4330000000000000);
__m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000);
__m256i magic_i_all = __lasx_xvreplgr2vr_d(0x4530000080100000);
__m256d magic_d_all = _lasx_256_castsi256_pd(magic_i_all);
// Blend the 32 lowest significant bits of v with magic_int_lo
__m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1);
__m256i v_lo = __lasx_xvbitsel_v(magic_i_lo, v.val, mask);
// Extract the 32 most significant bits of v
__m256i v_hi = __lasx_xvsrli_d(v.val, 32);
// Flip the msb of v_hi and blend with 0x45300000
v_hi = __lasx_xvxor_v(v_hi, magic_i_hi32);
// Compute in double precision
__m256d v_hi_dbl = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all);
// (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition
__m256d result = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo));
return v_float64x4(result);
}
{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
////////////// Lookup table access ////////////////////
@ -1967,11 +1862,9 @@ inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
__m256i vzero = __lasx_xvreplgr2vr_w(0);
__m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
_v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100));
__m256i t2 = __lasx_xvshuf_b(vzero, t1,
_v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100));
return v_int8x32(__lasx_xvperm_w(t2,
__m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
_v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
return v_int8x32(__lasx_xvperm_w(t1,
_v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
@ -1980,11 +1873,9 @@ inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
__m256i vzero = __lasx_xvreplgr2vr_w(0);
__m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
_v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100));
__m256i t2 = __lasx_xvshuf_b(vzero, t1,
_v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100));
return v_int16x16(__lasx_xvperm_w(t2,
__m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
_v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
return v_int16x16(__lasx_xvperm_w(t1,
_v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
@ -2018,24 +1909,21 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
__m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
__m256i odd = __lasx_xvmulwod_d_w(a.val, b.val);
return v_int64x4(__lasx_xvadd_d(even, odd));
return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }
{
__m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00);
__m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m);
__m256i odd_a = __lasx_xvsrli_h(a.val, 8);
__m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), even_m);
__m256i odd_b = __lasx_xvsrli_h(b.val, 8);
__m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
__m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
__m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
__m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
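The 8 → 32 dot products above are rewritten with widening even/odd multiplies followed by horizontal pairwise adds. A scalar model of the value each 32-bit output lane ends up holding, per the universal-intrinsics contract (illustrative reference, independent of register layout):

```cpp
// Each 32-bit output accumulates four adjacent 8-bit products.
#include <cstddef>
#include <cstdint>

static inline void dotprod_expand_u8(const uint8_t* a, const uint8_t* b,
                                     uint32_t* dst, size_t nout)
{
    for (size_t i = 0; i < nout; ++i)
    {
        uint32_t s = 0;
        for (int k = 0; k < 4; ++k)
            s += (uint32_t)a[4*i + k] * b[4*i + k];
        dst[i] = s;
    }
}
```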
@ -2043,14 +1931,10 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, con
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
__m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8);
__m256i odd_a = __lasx_xvsrai_h(a.val, 8);
__m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8);
__m256i odd_b = __lasx_xvsrai_h(b.val, 8);
__m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
__m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
__m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
__m256i odd = __lasx_xvmulwod_h_b(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_w_h(even, even);
__m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
@ -2059,36 +1943,24 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = __lasx_xvmul_h(a.val, b.val);
__m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
__m256i mul0 = __lasx_xvilvl_h(mulhi, mullo);
__m256i mul1 = __lasx_xvilvh_h(mulhi, mullo);
__m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p13 = __lasx_xvsrli_d(mul0, 32);
__m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p57 = __lasx_xvsrli_d(mul1, 32);
__m256i p15_ = __lasx_xvadd_d(p02, p13);
__m256i p9d_ = __lasx_xvadd_d(p46, p57);
return v_uint64x4(__lasx_xvadd_d(
__lasx_xvilvl_d(p9d_, p15_),
__lasx_xvilvh_d(p9d_, p15_)));
__m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
__m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
__m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
__m256i sign = __lasx_xvsrai_w(prod, 31);
__m256i lo = __lasx_xvilvl_w(sign, prod);
__m256i hi = __lasx_xvilvh_w(sign, prod);
return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo)));
__m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_h(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_d_w(even, even);
__m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
return v_int64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }
@ -2126,20 +1998,11 @@ inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, c
// 16 >> 64
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = __lasx_xvmul_h(a.val, b.val);
__m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
__m256i mul0 = __lasx_xvilvl_h(mulhi, mullo);
__m256i mul1 = __lasx_xvilvh_h(mulhi, mullo);
__m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p13 = __lasx_xvsrli_d(mul0, 32);
__m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
__m256i p57 = __lasx_xvsrli_d(mul1, 32);
__m256i p15_ = __lasx_xvadd_d(p02, p13);
__m256i p9d_ = __lasx_xvadd_d(p46, p57);
return v_uint64x4(__lasx_xvadd_d(p15_, p9d_));
__m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
__m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
__m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
__m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
@ -2261,12 +2124,7 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i t = __lasx_xvreplgr2vr_h(255);
__m256i a1 = __lasx_xvmin_hu(a.val, t);
__m256i b1 = __lasx_xvmin_hu(b.val, t);
return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1)));
}
{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{
@ -2277,12 +2135,7 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
inline void v_pack_store(uchar *ptr, const v_uint16x16& a)
{
const __m256i m = __lasx_xvreplgr2vr_h(255);
__m256i am = __lasx_xvmin_hu(a.val, m);
am = _v256_shuffle_odd_64(_lasx_packus_h(am, am));
v_store_low(ptr, v_uint8x32(am));
}
{ v_store_low(ptr, v_pack(a, a)); }
inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
@ -2290,45 +2143,43 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
template<int n> inline
v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
__m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
return v_uint8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
__m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
__m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
return v_uint8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack((a + delta) >> n, (b + delta) >> n);
__m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
return v_int8x32(_v256_shuffle_odd_64(res));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
// 32
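The `v_rshr_pack*` rewrites fold the add-delta/shift/saturate sequence into one `xvssrarni`/`xvssrlrni` with immediate `n`. A scalar sketch of what one lane computes, matching the code being replaced (n > 0 assumed, as in the original):

```cpp
// Rounding shift right by n, then saturate to the narrower type.
#include <algorithm>
#include <cstdint>

template<int n>
static inline int8_t rshr_pack_s16_to_s8(int16_t v)
{
    int32_t r = ((int32_t)v + (1 << (n - 1))) >> n;
    return (int8_t)std::min(std::max(r, -128), 127);
}

template<int n>
static inline uint8_t rshr_pack_u16_to_u8(uint16_t v)
{
    uint32_t r = ((uint32_t)v + (1u << (n - 1))) >> n;
    return (uint8_t)std::min<uint32_t>(r, 255u);
}
```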
@ -2346,67 +2197,51 @@ inline void v_pack_store(short* ptr, const v_int32x8& a)
inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
const __m256i m = __lasx_xvreplgr2vr_w(65535);
__m256i am = __lasx_xvmin_wu(a.val, m);
am = _v256_shuffle_odd_64(_lasx_packus_w(am, am));
v_store_low(ptr, v_uint16x16(am));
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_u_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
__m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
}
// 64
// Non-saturating pack
inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
__m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
__m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08);
__m256i ab = __lasx_xvilvl_d(b0, a0);
__m256i ab = __lasx_xvpickev_w(b.val, a.val);
return v_uint32x8(_v256_shuffle_odd_64(ab));
}
@ -2424,31 +2259,19 @@ inline void v_pack_store(int* ptr, const v_int64x4& b)
template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
}
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); }
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
}
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); }
// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
@ -2583,63 +2406,48 @@ template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
///////////////////// load deinterleave /////////////////////////////
inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b)
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 32, 0);
const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
__m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint8x32(a0);
b = v_uint8x32(b0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i p0 = __lasx_xvpickev_b(t1, t0);
__m256i p1 = __lasx_xvpickod_b(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 16, 0);
const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
__m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
__m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint16x16(a0);
b = v_uint16x16(b0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i p0 = __lasx_xvpickev_h(t1, t0);
__m256i p1 = __lasx_xvpickod_h(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 8, 0);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
//const int sh = 0+2*4+1*16+3*64;
__m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8);
__m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8);
__m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
__m256i a0 = __lasx_xvilvl_d(ph, pl);
__m256i b0 = __lasx_xvilvh_d(ph, pl);
a = v_uint32x8(a0);
b = v_uint32x8(b0);
__m256i p0 = __lasx_xvpickev_w(t1, t0);
__m256i p1 = __lasx_xvpickod_w(t1, t0);
a.val = __lasx_xvpermi_d(p0, 0xd8);
b.val = __lasx_xvpermi_d(p1, 0xd8);
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
__m256i ab0 = __lasx_xvld(ptr, 0);
__m256i ab1 = __lasx_xvld(ptr + 4, 0);
__m256i ab1 = __lasx_xvld(ptr, 32);
__m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
__m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
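The two-channel deinterleaving loads now use `xvpickev`/`xvpickod` plus a lane permute instead of byte shuffles. A scalar model of the contract they satisfy (even-indexed elements go to `a`, odd-indexed to `b`):

```cpp
// Scalar two-channel deinterleave, shown for 8-bit elements.
#include <cstddef>
#include <cstdint>

static inline void deinterleave2_u8(const uint8_t* src, uint8_t* a, uint8_t* b, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        a[i] = src[2*i];
        b[i] = src[2*i + 1];
    }
}
```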
@ -2652,8 +2460,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 32, 0);
__m256i bgr2 = __lasx_xvld(ptr + 64, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2686,8 +2494,8 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b,
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 16, 0);
__m256i bgr2 = __lasx_xvld(ptr + 32, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2717,8 +2525,8 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16&
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 8, 0);
__m256i bgr2 = __lasx_xvld(ptr + 16, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
__m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@ -2741,8 +2549,8 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 4, 0);
__m256i bgr2 = __lasx_xvld(ptr + 8, 0);
__m256i bgr1 = __lasx_xvld(ptr, 32);
__m256i bgr2 = __lasx_xvld(ptr, 64);
__m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
__m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
@ -2758,79 +2566,58 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 32, 0);
__m256i bgr2 = __lasx_xvld(ptr + 64, 0);
__m256i bgr3 = __lasx_xvld(ptr + 96, 0);
const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
__m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
__m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
__m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
__m256i p23l = __lasx_xvilvl_w(p3, p2);
__m256i p23h = __lasx_xvilvh_w(p3, p2);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i t2 = __lasx_xvld(ptr, 64);
__m256i t3 = __lasx_xvld(ptr, 96);
__m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
__m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
__m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
__m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
__m256i ac_lo = __lasx_xvpickev_b(t1, t0);
__m256i bd_lo = __lasx_xvpickod_b(t1, t0);
__m256i ac_hi = __lasx_xvpickev_b(t3, t2);
__m256i bd_hi = __lasx_xvpickod_b(t3, t2);
__m256i b0 = __lasx_xvilvl_w(plh, pll);
__m256i g0 = __lasx_xvilvh_w(plh, pll);
__m256i r0 = __lasx_xvilvl_w(phh, phl);
__m256i a0 = __lasx_xvilvh_w(phh, phl);
__m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
__m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
__m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
__m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
a = v_uint8x32(b0);
b = v_uint8x32(g0);
c = v_uint8x32(r0);
d = v_uint8x32(a0);
a.val = __lasx_xvperm_w(a_pre, sh);
b.val = __lasx_xvperm_w(b_pre, sh);
c.val = __lasx_xvperm_w(c_pre, sh);
d.val = __lasx_xvperm_w(d_pre, sh);
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
{
__m256i bgr0 = __lasx_xvld(ptr, 0);
__m256i bgr1 = __lasx_xvld(ptr + 16, 0);
__m256i bgr2 = __lasx_xvld(ptr + 32, 0);
__m256i bgr3 = __lasx_xvld(ptr + 48, 0);
const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
__m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
__m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
__m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
__m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
__m256i t0 = __lasx_xvld(ptr, 0);
__m256i t1 = __lasx_xvld(ptr, 32);
__m256i t2 = __lasx_xvld(ptr, 64);
__m256i t3 = __lasx_xvld(ptr, 96);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
__m256i p23l = __lasx_xvilvl_w(p3, p2);
__m256i p23h = __lasx_xvilvh_w(p3, p2);
__m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
__m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
__m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
__m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
__m256i ac_lo = __lasx_xvpickev_h(t1, t0);
__m256i bd_lo = __lasx_xvpickod_h(t1, t0);
__m256i ac_hi = __lasx_xvpickev_h(t3, t2);
__m256i bd_hi = __lasx_xvpickod_h(t3, t2);
__m256i b0 = __lasx_xvilvl_w(plh, pll);
__m256i g0 = __lasx_xvilvh_w(plh, pll);
__m256i r0 = __lasx_xvilvl_w(phh, phl);
__m256i a0 = __lasx_xvilvh_w(phh, phl);
__m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
__m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
__m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
__m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
a = v_uint16x16(b0);
b = v_uint16x16(g0);
c = v_uint16x16(r0);
d = v_uint16x16(a0);
a.val = __lasx_xvperm_w(a_pre, sh);
b.val = __lasx_xvperm_w(b_pre, sh);
c.val = __lasx_xvperm_w(c_pre, sh);
d.val = __lasx_xvperm_w(d_pre, sh);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
{
__m256i p0 = __lasx_xvld(ptr, 0);
__m256i p1 = __lasx_xvld(ptr + 8, 0);
__m256i p2 = __lasx_xvld(ptr + 16, 0);
__m256i p3 = __lasx_xvld(ptr + 24, 0);
__m256i p1 = __lasx_xvld(ptr, 32);
__m256i p2 = __lasx_xvld(ptr, 64);
__m256i p3 = __lasx_xvld(ptr, 96);
__m256i p01l = __lasx_xvilvl_w(p1, p0);
__m256i p01h = __lasx_xvilvh_w(p1, p0);
@ -2856,9 +2643,9 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
{
__m256i bgra0 = __lasx_xvld(ptr, 0);
__m256i bgra1 = __lasx_xvld(ptr + 4, 0);
__m256i bgra2 = __lasx_xvld(ptr + 8, 0);
__m256i bgra3 = __lasx_xvld(ptr + 12, 0);
__m256i bgra1 = __lasx_xvld(ptr, 32);
__m256i bgra2 = __lasx_xvld(ptr, 64);
__m256i bgra3 = __lasx_xvld(ptr, 96);
__m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
__m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);

File diff suppressed because it is too large (modules/core/include/opencv2/core/hal/intrin_lsx.hpp, 2536 changed lines).

@ -427,6 +427,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_LSX] = "LSX";
g_hwFeatureNames[CPU_LASX] = "LASX";
}
@ -703,6 +704,10 @@ struct HWFeatures
have[CV_CPU_RVV] = true;
#endif
#if defined __loongarch_sx
have[CV_CPU_LSX] = true;
#endif
#if defined __loongarch_asx
have[CV_CPU_LASX] = true;
#endif
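The runtime table above keys off the compiler-defined `__loongarch_sx` / `__loongarch_asx` macros, which the `-mlsx` / `-mlasx` flags from the CMake changes enable. A standalone probe, assuming a LoongArch toolchain:

```cpp
// Prints which LoongArch SIMD level the translation unit was compiled for.
#include <cstdio>

int main()
{
#if defined __loongarch_asx
    std::puts("compiled with LASX (-mlasx)");
#elif defined __loongarch_sx
    std::puts("compiled with LSX (-mlsx)");
#else
    std::puts("no LoongArch SIMD flags");
#endif
    return 0;
}
```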
