diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index cc0b5f8216..b1751e3987 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
+list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
 
 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -380,6 +381,12 @@ elseif(RISCV)
   set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
   set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")
 
+elseif(LOONGARCH64)
+  ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
+  ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
+  set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
+
 endif()
 
 # Helper values for cmake-gui
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 7f229cde96..8fe89b3fe0 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
   set(MIPS 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
   set(RISCV 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
+  set(LOONGARCH64 1)
 else()
   if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
     message(WARNING "OpenCV: unrecognized target processor configuration")
diff --git a/cmake/checks/cpu_lasx.cpp b/cmake/checks/cpu_lasx.cpp
new file mode 100644
index 0000000000..9d3b2a8725
--- /dev/null
+++ b/cmake/checks/cpu_lasx.cpp
@@ -0,0 +1,23 @@
+#include <stdio.h>
+
+#if defined(__loongarch_asx)
+# include <lasxintrin.h>
+# define CV_LASX 1
+#endif
+
+#if defined CV_LASX
+int test()
+{
+    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
+    v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
+    return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s (val), 7);
+}
+#else
+#error "LASX is not supported"
+#endif
+
+int main()
+{
+    printf("%d\n", test());
+    return 0;
+}
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 12e4cb47b8..3235b6317e 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -172,6 +172,11 @@
 # define CV_MSA 1
 #endif
 
+#ifdef CV_CPU_COMPILE_LASX
+# include <lasxintrin.h>
+# define CV_LASX 1
+#endif
+
 #ifdef __EMSCRIPTEN__
 # define CV_WASM_SIMD 1
 # include <wasm_simd128.h>
@@ -370,3 +375,7 @@ struct VZeroUpperGuard {
 #ifndef CV_RVV
 # define CV_RVV 0
 #endif
+
+#ifndef CV_LASX
+# define CV_LASX 0
+#endif
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index 91b853de0c..41fc9d50fa 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -525,5 +525,26 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) 
CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 1 +# define CV_CPU_HAS_SUPPORT_LASX 1 +# define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX)) +# define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +# define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +#else +# define CV_TRY_LASX 0 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX 0 +# define CV_CPU_CALL_LASX(fn, args) +# define CV_CPU_CALL_LASX_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...) CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 21e3792162..95dc81fb46 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -279,6 +279,8 @@ namespace cv { #define CV_CPU_RVV 210 +#define CV_CPU_LASX 230 + // CPU features groups #define CV_CPU_AVX512_SKX 256 #define CV_CPU_AVX512_COMMON 257 @@ -336,6 +338,8 @@ enum CpuFeatures { CPU_RVV = 210, + CPU_LASX = 230, + CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512 CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index c12140bbf8..6eac27e763 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE) #include "opencv2/core/hal/intrin_rvv.hpp" + #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE #include "opencv2/core/hal/intrin_rvv_scalable.hpp" + +#elif CV_LASX + #if !defined(CV_FORCE_SIMD128_CPP) + #define CV_FORCE_SIMD128_CPP 1 + #endif +#include "opencv2/core/hal/intrin_cpp.hpp" + #else #include "opencv2/core/hal/intrin_cpp.hpp" @@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif +#if CV_LASX + +#define CV__SIMD_FORWARD 256 +#include "opencv2/core/hal/intrin_forward.hpp" +#include "opencv2/core/hal/intrin_lasx.hpp" + +#endif + //! @cond IGNORED namespace cv { diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp new file mode 100644 index 0000000000..37f2e3f81d --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -0,0 +1,3236 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_LASX_HPP +#define OPENCV_HAL_INTRIN_LASX_HPP + +#include +#include + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 +#define CV_SIMD256_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31 }; +} + +inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v31, v30, + v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, + v19, v18, v17, v16, v15, v14, v13, v12, v11, v10, + v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15) +{ + return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }; +} + +inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setall_w(int v0) +{ + return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 }; +} + +inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v0, v1, v2, v3 }; +} + +inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v3, v2, v1, v0 }; +} + +inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7) +{ + return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256 _v256_setall_ps(float f32) +{ + return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 }; +} + +inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3) +{ + return (__m256d)v4f64{ v0, v1, v2, v3 }; +} + +inline __m256d _v256_setall_pd(double f64) +{ + return (__m256d)v4f64{ f64, f64, f64, f64 }; +} + +inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b) +{ + __m256i u8min = __lasx_xvreplgr2vr_h(0); + __m256i u8max = __lasx_xvreplgr2vr_h(255); + __m256i sat_a = __lasx_xvmax_h(a, u8min); + sat_a = __lasx_xvmin_h(sat_a, u8max); + __m256i sat_b = __lasx_xvmax_h(b, u8min); + sat_b = __lasx_xvmin_h(sat_b, u8max); + __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); +} + +inline __m256i 
_lasx_packs_h(const __m256i& a, const __m256i& b) +{ + __m256i s8min = __lasx_xvreplgr2vr_h(-128); + __m256i s8max = __lasx_xvreplgr2vr_h(127); + __m256i sat_a = __lasx_xvmax_h(a, s8min); + sat_a = __lasx_xvmin_h(sat_a, s8max); + __m256i sat_b = __lasx_xvmax_h(b, s8min); + sat_b = __lasx_xvmin_h(sat_b, s8max); + __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); +} + +inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b) +{ + __m256i u16min = __lasx_xvreplgr2vr_w(0); + __m256i u16max = __lasx_xvreplgr2vr_w(0xffff); + __m256i sat_a = __lasx_xvmax_w(a, u16min); + sat_a = __lasx_xvmin_w(sat_a, u16max); + __m256i sat_b = __lasx_xvmax_w(b, u16min); + sat_b = __lasx_xvmin_w(sat_b, u16max); + __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14); + return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); +} + +inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b) +{ + __m256i s16min = __lasx_xvreplgr2vr_w(-0x8000); + __m256i s16max = __lasx_xvreplgr2vr_w(0x7fff); + __m256i sat_a = __lasx_xvmax_w(a, s16min); + sat_a = __lasx_xvmin_w(sat_a, s16max); + __m256i sat_b = __lasx_xvmax_w(b, s16min); + sat_b = __lasx_xvmin_w(sat_b, s16max); + __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14); + return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); +} + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline __m256i _v256_shuffle_odd_64(const __m256i& v) +{ return __lasx_xvpermi_d(v, 0xd8); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); } + +//LASX: only use for permute WITHOUT zero clearing +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return __lasx_xvpermi_q(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return __lasx_xvpermi_d(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31); + return *((__m128i*)&temp256i); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return __m128(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return __m128d(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return 
*((__m128i*)&v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return __m128(_v256_extract_low(*((__m256i*)&v))); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return __m128d(_v256_extract_low(*((__m256i*)&v))); } + +inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) +{ + const __m256i maxv = __lasx_xvreplgr2vr_w(65535); + __m256i am = __lasx_xvmin_wu(a, maxv); + __m256i bm = __lasx_xvmin_wu(b, maxv); + return _lasx_packus_w(am, bm); +} + +template +inline int _v256_extract_b(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_b(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_h(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_h(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_w(const __m256i& a) +{ + return __lasx_xvpickve2gr_w(a, i); +} + +template +inline int64 _v256_extract_d(const __m256i& a) +{ + return __lasx_xvpickve2gr_d(a, i); +} + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint8x32() {} + + uchar get0() const { + uchar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int8x32 +{ + typedef schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int8x32() {} + + schar get0() const { + schar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint16x16() {} + + ushort get0() 
const { + ushort des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int16x16() {} + + short get0() const { + short des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint32x8() {} + + unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int32x8() {} + + int get0() const { return __lasx_xvpickve2gr_w(val, 0); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + explicit v_float32x8(__m256i v) { val = *((__m256*)&v); } + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_float32x8() {} + + float get0() const { + float des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } + + int get0toint() const { + int des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_uint64x4() {} + + uint64 get0() const + { + return __lasx_xvpickve2gr_du(val, 0); + } +}; + +struct v_int64x4 +{ + typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _v256_setr_d(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_int64x4() {} + + int64 get0() const + { + return __lasx_xvpickve2gr_d(val, 0); + } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); } + v_float64x4(double v0, double v1, double v2, double v3) + { val = _v256_setr_pd(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_float64x4() {} + + double get0() const { + double des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + 
} + + int64 get0toint64() const { + int64 des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = __lsx_vld(ptr0, 0); \ + __m128i vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4, int64) + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = __lsx_vld(ptr0, 0); \ + halfreg vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i) +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i) + + +inline __m256i _lasx_256_castps_si256(const __m256& v) +{ return __m256i(v); } + 
+inline __m256i _lasx_256_castpd_si256(const __m256d& v) +{ return __m256i(v); } + +#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256) + +OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32, uchar, u8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int8x32, schar, s8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort, u16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int16x16, short, s16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8, unsigned, u32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int32x8, int, s32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4, uint64, u64, d, long int) +OPENCV_HAL_IMPL_LASX_INIT(v_int64x4, int64, s64, d, long int) + + +inline __m256 _lasx_256_castsi256_ps(const __m256i &v) +{ return __m256(v); } + +inline __m256d _lasx_256_castsi256_pd(const __m256i &v) +{ return __m256d(v); } + +#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_v256_setall_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float, f32, ps, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); } + +inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); } + + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), 
__m256i(a.val))); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); } + +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d) + + +// shuffle +// todo: emulate 64bit +#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \ + template \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8, w) + +template +inline v_float32x8 v256_shuffle(const v_float32x8 &a) +{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); } + +template +inline v_float64x4 v256_shuffle(const v_float64x4 &a) +{ + int imm8 = m & 0b0001; //0 or 1 + if (m & 0x0b0010) imm8 |= 0b0100; + //else imm8 |= 0b0000; + if (m & 0x0b0100) imm8 |= 0b110000; //2 or 3 + else imm8 |= 0b100000; + if (m & 0x0b1000) imm8 |= 0b11000000; + else imm8 |= 0b10000000; + + return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8)); +} +template +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x03>(a, b); } + +inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm) +{ + if (imm == 8) { + return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2 + } else { + __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex)); + } +} + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2 +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<0x1b>(a); } + + +// ZIP +#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x02>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x13>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = 
v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d) + +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d) + +// saturating multiply 8-bit, 16-bit +inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +{ + v_uint16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +{ + v_int16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_hu(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_uint16x16(_v256_packs_epu32(p0, p1)); +} +inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_h(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_int16x16(_lasx_packs_w(p0, p1)); +} +inline v_uint8x32& operator 
*= (v_uint8x32& a, const v_uint8x32& b) +{ a = a * b; return a; } +inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) +{ a = a * b; return a; } +inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) +{ a = a * b; return a; } +inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) +{ a = a * b; return a; } + +/** Non-saturating arithmetics **/ + +#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h) + +inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i ad = __lasx_xvsrai_h(a.val, 8); + __m256i bd = __lasx_xvsrai_h(b.val, 8); + __m256i p0 = __lasx_xvmul_h(a.val, b.val); + __m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8); + + const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00); + return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01)); +} +inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b, + v_uint16x16& c, v_uint16x16& d) +{ + v_uint16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b, + v_int16x16& c, v_int16x16& d) +{ + v_int16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val); + __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + +inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); } +inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); } + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const 
_Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } + +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h) +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w) + +inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift) +{ + __m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63); + __m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift); + return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift)); +} +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1)) + +#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; } + +#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); } + +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), 
_lasx_256_castsi256_pd) + +/** Select **/ +#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8) +OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8) + +inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +/** Comparison **/ +#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } \ + inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ + { return b > a; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { \ + return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \ + } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32, v_int8x32, b, bu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu) + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d) + +#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix) + +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d) + +inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); } + +inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); } + +inline v_float64x4 operator > (const v_float64x4 &a, 
const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); } + +inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); } + +inline v_float32x8 v_not_nan(const v_float32x8& a) +{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); } +inline v_float64x4 v_not_nan(const v_float64x4& a) +{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); } + +/** min/max **/ +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_R = (16 - imm) & 0xFF}; + enum {IMM_R2 = (32 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x21>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R)); + return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32 +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x03>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swap, a.val, imm)); + return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + enum {IMM_R = (16 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? [127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R)); + return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? 
[127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swapz, a.val, imm)); + return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L)); +} + +#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float64x4, _lasx_256_castsi256_pd) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd) + +/** Reverse **/ +inline v_uint8x32 v_reverse(const v_uint8x32 &a) +{ + static const __m256i perm = _v256_setr_b( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); + return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1)); +} + +inline v_int8x32 v_reverse(const v_int8x32 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x16 v_reverse(const v_uint16x16 &a) +{ + static const __m256i perm = _v256_setr_b( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); + return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1)); +} + +inline v_int16x16 v_reverse(const v_int16x16 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x8 v_reverse(const v_uint32x8 &a) +{ + static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0); + return v_uint32x8(__lasx_xvperm_w(a.val, perm)); +} + +inline v_int32x8 v_reverse(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x8 v_reverse(const v_float32x8 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x4 v_reverse(const v_uint64x4 &a) +{ + return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b)); +} + +inline v_int64x4 v_reverse(const v_int64x4 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x4 v_reverse(const v_float64x4 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////// Reduce and mask ///////// + +/** Reduce **/ +// this function is return a[0]+a[1]+...+a[31] +inline unsigned v_reduce_sum(const v_uint8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val); + __m256i t2 
= __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline int v_reduce_sum(const v_int8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val); + __m256i t2 = __lasx_xvhaddw_w_h(t1, t1); + __m256i t3 = __lasx_xvhaddw_d_w(t2, t2); + return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]); +} + + +#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \ + val = intrin(val, __lsx_vbsrl_v(val,8)); \ + val = intrin(val, __lsx_vbsrl_v(val,4)); \ + val = intrin(val, __lsx_vbsrl_v(val,2)); \ + val = intrin(val, __lsx_vbsrl_v(val,1)); \ + return (sctype)__lsx_vpickve2gr_w(val, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, min, __lsx_vmin_b) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, max, __lsx_vmax_b) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, min, __lsx_vmin_h) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, max, __lsx_vmax_h) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, min, __lsx_vmin_w) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, max, __lsx_vmax_w) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \ + float *fvalue = (float*)&v0; \ + return fvalue[0]; \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s) + +inline int v_reduce_sum(const v_int32x8& a) +{ + __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val); + return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline int v_reduce_sum(const v_int16x16& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline unsigned v_reduce_sum(const v_uint16x16& a) +{ return 
v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + float result = 0; + float *pa = (float*)&a; + for (int i = 0; i < 2; ++i) { + result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3]; + } + return result; +} + +inline uint64 v_reduce_sum(const v_uint64x4& a) +{ + uint64 *pa = (uint64*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} +inline int64 v_reduce_sum(const v_int64x4& a) +{ + int64 *pa = (int64*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} +inline double v_reduce_sum(const v_float64x4& a) +{ + double *pa = (double*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + float *pa = (float*)&a; + float *pb = (float*)&b; + float *pc = (float*)&c; + float *pd = (float*)&d; + + float v0 = pa[0] + pa[1] + pa[2] + pa[3]; + float v1 = pb[0] + pb[1] + pb[2] + pb[3]; + float v2 = pc[0] + pc[1] + pc[2] + pc[3]; + float v3 = pd[0] + pd[1] + pd[2] + pd[3]; + float v4 = pa[4] + pa[5] + pa[6] + pa[7]; + float v5 = pb[4] + pb[5] + pb[6] + pb[7]; + float v6 = pc[4] + pc[5] + pc[6] + pc[7]; + float v7 = pd[4] + pd[5] + pd[6] + pd[7]; + return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7); +} + +inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i t0 = __lasx_xvabsd_bu(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) +{ + __m256i t0 = __lasx_xvabsd_b(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_add_wrap(a - b, b - a), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) +{ + return v_reduce_sum(v_max(a, b) - v_min(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 m = a < b; + return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); +} +inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) +{ + v_float32x8 a_b = a - b; + return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff))); +} + +/** Popcount **/ +inline v_uint8x32 v_popcount(const v_uint8x32& a) +{ + __m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + __m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F); + return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)), + __lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask)))); +} +inline v_uint16x16 v_popcount(const v_uint16x16& a) +{ + v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff)); +} +inline 
v_uint32x8 v_popcount(const v_uint32x8& a) +{ + v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff)); +} +inline v_uint64x4 v_popcount(const v_uint64x4& a) +{ + v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a)); + uint8_t *pa = (uint8_t*)&atemp; + uint64 v[4]; + for (int i = 0; i < 4; ++i) { + v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7]; + } + return v_uint64x4(v[0], v[1], v[2], v[3]); +} +inline v_uint8x32 v_popcount(const v_int8x32& a) +{ return v_popcount(v_reinterpret_as_u8(a)); } +inline v_uint16x16 v_popcount(const v_int16x16& a) +{ return v_popcount(v_reinterpret_as_u16(a)); } +inline v_uint32x8 v_popcount(const v_int32x8& a) +{ return v_popcount(v_reinterpret_as_u32(a)); } +inline v_uint64x4 v_popcount(const v_int64x4& a) +{ return v_popcount(v_reinterpret_as_u64(a)); } + +/** Mask **/ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +inline int v_signmask(const v_int8x32& a) +{ + int mask = 0; + int8_t *pa = (int8_t*)&a; + for( int i = 0; i < 32; i++ ) + mask |= (reinterpret_int(pa[i]) < 0) << i; + return mask; +} +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ return v_signmask(v_pack(a, a)) & 0xFFFF; } +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + int mask = 0; + int *pa = (int*)&a; + for( int i = 0; i < 8; i++ ) + mask |= (pa[i] < 0) << i; + return mask; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_int64x4& a) +{ + int mask = 0; + int64 *pa = (int64*)&a; + for( int i = 0; i < 4; i++ ) + mask |= (pa[i] < 0) << i; + return mask; +} +inline int v_signmask(const v_uint64x4& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_float64x4& a) +{ return v_signmask(*(v_int64x4*)(&a)); } + +inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x8& a) { return 
trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +/** Checks **/ +#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15) + +#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \ + inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \ + inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16) + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b*b)); } + +OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d) + +inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return a * b + c; +} + +inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ + v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5); + v_float32x8 t = v_float32x8(__lasx_xvfrsqrt_s(x.val)); + t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half); + return t; +} + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ + return v_float64x4(1., 1., 1., 1.) 
/ v_sqrt(x); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); } + +OPENCV_HAL_IMPL_LASX_ABS(int8x32, b) +OPENCV_HAL_IMPL_LASX_ABS(int16x16, h) +OPENCV_HAL_IMPL_LASX_ABS(int32x8, w) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = v_sub_wrap(a, b); + v_int8x32 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 d = a - b; + v_int32x8 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(a - b); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = a - b; + v_int8x32 m = a < b; + return (d ^ m) - m; +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftint_w_s(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ __m256i t = __lasx_xvftint_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m256i abi = __lasx_xvftint_w_d(b.val, a.val); + return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); //3120 +} + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(__lasx_xvffint_s_w(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m256 abf = 
__lasx_xvfcvt_s_d(a.val, b.val); //warnning: order of a,b is diff from instruction xvfcvt.s.d + return v_float32x8(__lasx_xvpermi_d(abf, 0x8D)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvffintl_d_w(alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvffintl_d_w(ahigh)); +} + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh)); +} + +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x4 v_cvt_f64(const v_int64x4& v) +{ + // constants encoded as floating-point + __m256i magic_i_lo = __lasx_xvreplgr2vr_d(0x4330000000000000); + __m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000); + __m256i magic_i_all = __lasx_xvreplgr2vr_d(0x4530000080100000); + __m256d magic_d_all = _lasx_256_castsi256_pd(magic_i_all); + + // Blend the 32 lowest significant bits of v with magic_int_lo + __m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1); + __m256i v_lo = __lasx_xvbitsel_v(magic_i_lo, v.val, mask); + // Extract the 32 most significant bits of v + __m256i v_hi = __lasx_xvsrli_d(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = __lasx_xvxor_v(v_hi, magic_i_hi32); + // Compute in double precision + __m256d v_hi_dbl = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition + __m256d result = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo)); + return v_float64x4(result); +} + +////////////// Lookup table access //////////////////// + +inline v_int8x32 v256_lut(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], + tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], + tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]], + tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]], + tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], + tab[idx[30]], tab[idx[31]])); +} +inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), + *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), + *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]), + *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]), + *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), + *(const short*)(tab + idx[15]))); +} +inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]))); +} +inline v_uint8x32 v256_lut(const uchar* tab, const 
int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); } + +inline v_int16x16 v256_lut(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], + tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], + tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], + tab[idx[15]])); +} +inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int16x16 v256_lut_quads(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); + +} +inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); } + +inline v_int32x8 v256_lut(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int32x8 v256_lut_quads(const int* tab, const int* idx) +{ + return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); } + +inline v_int64x4 v256_lut(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } +inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return 
v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x8 v256_lut(const float* tab, const int* idx) +{ + return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} +inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } +inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); } + +inline v_float64x4 v256_lut(const double* tab, const int* idx) +{ + return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) +{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); } + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int *idx = (int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + const int *idx = (const int*)&idxvec.val; + __m128i xy01, xy45, xy23, xy67; + xy01 = __lsx_vld(tab + idx[0], 0); + xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10); + xy45 = __lsx_vld(tab + idx[4], 0); + xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10); + __m256i xy0145 = _v256_combine(xy01, xy45); + xy23 = __lsx_vld(tab + idx[2], 0); + xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10); + xy67 = __lsx_vld(tab + idx[6], 0); + xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10); + __m256i xy2367 = _v256_combine(xy23, xy67); + + __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145); + __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145); + + x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145)); + y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + //int CV_DECL_ALIGNED(32) idx[4]; + const int *idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy2 = __lsx_vld(tab + idx[2], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + __m128i xy3 = __lsx_vld(tab + idx[3], 0); + __m256i xy02 = _v256_combine(xy0, xy2); + __m256i xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(__lasx_xvilvl_d(xy13, xy02)); + y = v_float64x4(__lasx_xvilvh_d(xy13, xy02)); +} + +inline v_int8x32 v_interleave_pairs(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); +} +inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x32 v_interleave_quads(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); +} +inline v_uint8x32 
v_interleave_quads(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_interleave_pairs(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); +} +inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x16 v_interleave_quads(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); +} +inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_interleave_pairs(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8)); +} +inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) +{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x32 v_pack_triplets(const v_int8x32& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100)); + __m256i t2 = __lasx_xvshuf_b(vzero, t1, + _v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100)); + return v_int8x32(__lasx_xvperm_w(t2, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_pack_triplets(const v_int16x16& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100)); + __m256i t2 = __lasx_xvshuf_b(vzero, t1, + _v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100)); + return v_int16x16(__lasx_xvperm_w(t2, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_pack_triplets(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvperm_w(vec.val, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_pack_triplets(const v_float32x8& vec) +{ + return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val), + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} + +////////// Matrix operations ///////// + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 
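+// For reference, a scalar sketch of what the 32 >> 64 dot product below computes
+// (illustrative only, not part of this patch; ref_dotprod32 is a hypothetical helper):
+// each 64-bit output lane i is a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1], with both products
+// widened to 64 bit before the addition.
+//
+//     inline int64 ref_dotprod32(const int* a, const int* b, int i)
+//     {
+//         return (int64)a[2*i] * b[2*i] + (int64)a[2*i + 1] * b[2*i + 1];
+//     }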
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) +{ + __m256i even = __lasx_xvmulwev_d_w(a.val, b.val); + __m256i odd = __lasx_xvmulwod_d_w(a.val, b.val); + return v_int64x4(__lasx_xvadd_d(even, odd)); +} +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00); + __m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m); + __m256i odd_a = __lasx_xvsrli_h(a.val, 8); + + __m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), even_m); + __m256i odd_b = __lasx_xvsrli_h(b.val, 8); + + __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); + __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + return v_uint32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) +{ + __m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8); + __m256i odd_a = __lasx_xvsrai_h(a.val, 8); + + __m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8); + __m256i odd_b = __lasx_xvsrai_h(b.val, 8); + + __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); + __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + return v_int32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = __lasx_xvmul_h(a.val, b.val); + __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); + __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); + __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); + + __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p13 = __lasx_xvsrli_d(mul0, 32); + __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p57 = __lasx_xvsrli_d(mul1, 32); + + __m256i p15_ = __lasx_xvadd_d(p02, p13); + __m256i p9d_ = __lasx_xvadd_d(p46, p57); + + return v_uint64x4(__lasx_xvadd_d( + __lasx_xvilvl_d(p9d_, p15_), + __lasx_xvilvh_d(p9d_, p15_))); +} +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); + __m256i sign = __lasx_xvsrai_w(prod, 31); + + __m256i lo = __lasx_xvilvl_w(sign, prod); + __m256i hi = __lasx_xvilvh_w(sign, prod); + + return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo))); +} +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& 
c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b) +{ return v_dotprod(a, b); } +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod(a, b); } +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = __lasx_xvmul_h(a.val, b.val); + __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); + __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); + __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); + + __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p13 = __lasx_xvsrli_d(mul0, 32); + __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p57 = __lasx_xvsrli_d(mul1, 32); + + __m256i p15_ = __lasx_xvadd_d(p02, p13); + __m256i p9d_ = __lasx_xvadd_d(p46, p57); + + return v_uint64x4(__lasx_xvadd_d(p15_, p9d_)); +} +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); + __m256i sign = __lasx_xvsrai_w(prod, 31); + __m256i lo = __lasx_xvilvl_w(sign, prod); + __m256i hi = __lasx_xvilvh_w(sign, prod); + return v_int64x4(__lasx_xvadd_d(lo, hi)); +} +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b, c); } + + +#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \ + v_float32x8(__lasx_xvpermi_w(a.val, a.val, im)) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = 
OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + + +#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \ + __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \ + __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \ + __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \ + b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \ + b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \ + b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \ + b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \ + } + +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) + +inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1, + const v_float32x8 &a2, const v_float32x8 &a3, + v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3) +{ + __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val)); + __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val)); + b0.val = __m256(__lasx_xvilvl_d(t1, t0)); + b1.val = __m256(__lasx_xvilvh_d(t1, t0)); + b2.val = __m256(__lasx_xvilvl_d(t3, t2)); + b3.val = __m256(__lasx_xvilvh_d(t3, t2)); +} + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(a.val); \ + b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(a.val)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpwvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16, uchar, __lasx_vext2xv_hu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16, schar, __lasx_vext2xv_h_b) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8, ushort, __lasx_vext2xv_wu_hu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8, short, __lasx_vext2xv_w_h) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4, unsigned, __lasx_vext2xv_du_wu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4, int, __lasx_vext2xv_d_w) + +#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8, schar, __lasx_vext2xv_w_b) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i t = __lasx_xvreplgr2vr_h(255); + __m256i a1 = __lasx_xvmin_hu(a.val, t); + __m256i b1 = __lasx_xvmin_hu(b.val, t); + return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1))); 
+}
+
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    const __m256i m = __lasx_xvreplgr2vr_h(255);
+    __m256i am = __lasx_xvmin_hu(a.val, m);
+    am = _v256_shuffle_odd_64(_lasx_packus_h(am, am));
+    v_store_low(ptr, v_uint8x32(am));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+                    v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
+{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
+
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
+
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    const __m256i m = __lasx_xvreplgr2vr_w(65535);
+    __m256i am = __lasx_xvmin_wu(a.val, m);
+    am = _v256_shuffle_odd_64(_lasx_packus_w(am, am));
+    v_store_low(ptr, v_uint16x16(am));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
+{
+    // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
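+    // Adding 1 << (n-1) before the right shift rounds to nearest (halves go up) instead
+    // of truncating; e.g. with n = 2: (5 + 2) >> 2 == 1 and (6 + 2) >> 2 == 2.
+    // The same rounding pattern is used by every v_rshr_pack* variant above and below.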
+    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+                    v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
+    __m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08);
+    __m256i ab = __lasx_xvilvl_d(b0, a0);
+    return v_uint32x8(_v256_shuffle_odd_64(ab));
+}
+
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
+    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x4& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
+{
+    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a)
+{
+    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// pack boolean
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
+{
+    __m256i ab = _lasx_packs_h(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(ab));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
+                           const v_uint32x8& c, const v_uint32x8& d)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val);
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+
+    __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
+    return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val);
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+    __m256i ef = _lasx_packs_w(e.val, f.val);
+    __m256i gh = _lasx_packs_w(g.val, h.val);
+
+    __m256i abcd = _lasx_packs_w(ab, cd);
+    __m256i efgh = _lasx_packs_w(ef, gh);
+
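+    // The inputs are 64-bit boolean masks (all zeros or all ones), so the cascade of
+    // saturating packs above only narrows them; the final pack and the shuffle/interleave
+    // steps below restore the logical lane order of the 32 result bytes.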
+    __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));
+
+    __m256i rev = _v256_alignr_b(pkall, pkall, 8);
+    return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
+}
+
+/* Recombine */
+// it's up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \
+    template<int s> \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
+
+template<int i>
+inline uchar v_extract_n(v_uint8x32 a)
+{
+    return (uchar)_v256_extract_b<i>(a.val);
+}
+
+template<int i>
+inline schar v_extract_n(v_int8x32 a)
+{
+    return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
+}
+
+template<int i>
+inline ushort v_extract_n(v_uint16x16 a)
+{
+    return (ushort)_v256_extract_h<i>(a.val);
+}
+
+template<int i>
+inline short v_extract_n(v_int16x16 a)
+{
+    return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
+}
+
+template<int i>
+inline uint v_extract_n(v_uint32x8 a)
+{
+    return (uint)_v256_extract_w<i>(a.val);
+}
+
+template<int i>
+inline int v_extract_n(v_int32x8 a)
+{
+    return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
+}
+
+template<int i>
+inline uint64 v_extract_n(v_uint64x4 a)
+{
+    return (uint64)_v256_extract_d<i>(a.val);
+}
+
+template<int i>
+inline int64 v_extract_n(v_int64x4 v)
+{
+    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
+}
+
+template<int i>
+inline float v_extract_n(v_float32x8 v)
+{
+    union { uint iv; float fv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(v_float64x4 v)
+{
+    union { uint64 iv; double dv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
+{
+    static const __m256i perm = __lasx_xvreplgr2vr_w((char)i);
+    return v_uint32x8(__lasx_xvperm_w(a.val, perm));
+}
+
+template<int i>
+inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{
+    __m256i ab0 = __lasx_xvld(ptr, 0);
+    __m256i ab1 = __lasx_xvld(ptr + 32, 0);
+
+    const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
+    __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
+    __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
+    __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
+    __m256i a0 = __lasx_xvilvl_d(ph, pl);
+    __m256i b0 = __lasx_xvilvh_d(ph, pl);
+    a = v_uint8x32(a0);
+    b = v_uint8x32(b0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{
+    __m256i ab0 = __lasx_xvld(ptr, 0);
+    __m256i ab1 = __lasx_xvld(ptr + 16, 0);
+
+    const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
+    __m256i p1 =
__lasx_xvshuf_b(ab1, ab1, sh); + __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint16x16(a0); + b = v_uint16x16(b0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) +{ + __m256i ab0 = __lasx_xvld(ptr, 0); + __m256i ab1 = __lasx_xvld(ptr + 8, 0); + + //const int sh = 0+2*4+1*16+3*64; + __m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8); + __m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8); + __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint32x8(a0); + b = v_uint32x8(b0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) +{ + __m256i ab0 = __lasx_xvld(ptr, 0); + __m256i ab1 = __lasx_xvld(ptr + 4, 0); + + __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02); + __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint64x4(a0); + b = v_uint64x4(b0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 32, 0); + __m256i bgr2 = __lasx_xvld(ptr + 64, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); + + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + + const __m256i + sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, + 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), + sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, + 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14), + sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, + 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint8x32(b0); + b = v_uint8x32(g0); + c = v_uint8x32(r0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 16, 0); + __m256i bgr2 = __lasx_xvld(ptr + 32, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + 
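+    // After the blends, b0/g0/r0 each collect all 16 values of one channel, but still
+    // at the interleaved positions they were loaded from; the byte shuffles below
+    // (sh_b/sh_g/sh_r) rearrange them into ascending element order.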
const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint16x16(b0); + b = v_uint16x16(g0); + c = v_uint16x16(r0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 8, 0); + __m256i bgr2 = __lasx_xvld(ptr + 16, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92); + + b0 = __lasx_xvshuf4i_w(b0, 0x6c); + g0 = __lasx_xvshuf4i_w(g0, 0xb1); + r0 = __lasx_xvshuf4i_w(r0, 0xc6); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 4, 0); + __m256i bgr2 = __lasx_xvld(ptr + 8, 0); + + __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128 + __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12); + __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b); + __m256i b0 = __lasx_xvilvl_d(s20r, s01); + __m256i g0 = _v256_alignr_b(s12, s01, 8); + __m256i r0 = __lasx_xvilvh_d(s12, s20r); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 32, 0); + __m256i bgr2 = __lasx_xvld(ptr + 64, 0); + __m256i bgr3 = __lasx_xvld(ptr + 96, 0); + const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); + __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); + __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); + __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint8x32(b0); + b = v_uint8x32(g0); + c = v_uint8x32(r0); + d = v_uint8x32(a0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, 
v_uint16x16& d ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 16, 0); + __m256i bgr2 = __lasx_xvld(ptr + 32, 0); + __m256i bgr3 = __lasx_xvld(ptr + 48, 0); + const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); + __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); + __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); + __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint16x16(b0); + b = v_uint16x16(g0); + c = v_uint16x16(r0); + d = v_uint16x16(a0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d ) +{ + __m256i p0 = __lasx_xvld(ptr, 0); + __m256i p1 = __lasx_xvld(ptr + 8, 0); + __m256i p2 = __lasx_xvld(ptr + 16, 0); + __m256i p3 = __lasx_xvld(ptr + 24, 0); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); + d = v_uint32x8(a0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d ) +{ + __m256i bgra0 = __lasx_xvld(ptr, 0); + __m256i bgra1 = __lasx_xvld(ptr + 4, 0); + __m256i bgra2 = __lasx_xvld(ptr + 8, 0); + __m256i bgra3 = __lasx_xvld(ptr + 12, 0); + + __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02); + __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13); + __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02); + __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13); + + __m256i b0 = __lasx_xvilvl_d(l13, l02); + __m256i g0 = __lasx_xvilvh_d(l13, l02); + __m256i r0 = __lasx_xvilvl_d(h13, h02); + __m256i a0 = __lasx_xvilvh_d(h13, h02); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); + d = v_uint64x4(a0); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_b(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_b(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 32*1); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = 
__lasx_xvilvl_h(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_h(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 16*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_w(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_w(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 8*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_d(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_d(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 4*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + const __m256i sh_g = _v256_setr_b( + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + const __m256i sh_r = _v256_setr_b( + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); + + const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 32); + __lasx_xvst(bgr2, (__m256i*)ptr, 64); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b( + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + const __m256i sh_r = _v256_setr_b( + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); + + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, 
-1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgr2, (__m256i*)ptr, 32*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c); + __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1); + __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6); + + __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p2, (__m256i*)ptr, 8*4); + __lasx_xvst(bgr2, (__m256i*)ptr, 16*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i s01 = __lasx_xvilvl_d(b.val, a.val); + __m256i s12 = __lasx_xvilvh_d(c.val, b.val); + __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4); + + __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30); + __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 4*8); + __lasx_xvst(bgr2, (__m256i*)ptr, 8*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, + const v_uint8x32& c, const v_uint8x32& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_b(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_b(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_b(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_b(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 32); + __lasx_xvst(bgra2, (__m256i*)ptr, 64); + __lasx_xvst(bgra3, (__m256i*)ptr, 96); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, + const v_uint16x16& c, const v_uint16x16& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_h(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_h(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_h(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_h(d.val, 
c.val); + + __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgra2, (__m256i*)ptr, 32*2); + __lasx_xvst(bgra3, (__m256i*)ptr, 48*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_w(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_w(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_w(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_w(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 8*4); + __lasx_xvst(bgra2, (__m256i*)ptr, 16*4); + __lasx_xvst(bgra3, (__m256i*)ptr, 24*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, + const v_uint64x4& c, const v_uint64x4& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_d(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_d(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_d(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_d(d.val, c.val); + + __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16); + __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16); + __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8); + __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8); + __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8); +} + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 
= v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \ +} + +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) + +// +// FP16 +// + +inline v_float32x8 v256_load_expand(const float16_t* ptr) +{ +#if CV_FP16 + //1-load128, 2-permi, 3-cvt + return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10))); +#else + float CV_DECL_ALIGNED(32) buf[8]; + for (int i = 0; i < 8; i++) + buf[i] = (float)ptr[i]; + return v256_load_aligned(buf); +#endif +} + +inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +{ +#if CV_FP16 + __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val); + __lsx_vst((__m128i)ah, ptr, 0); +#else + float CV_DECL_ALIGNED(32) buf[8]; + v_store_aligned(buf, a); + for (int i = 0; i < 8; i++) + ptr[i] = float16_t(buf[i]); +#endif +} + +// +// end of FP16 +// + +inline void v256_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_LASX_HPP diff --git a/modules/core/src/parallel_impl.cpp b/modules/core/src/parallel_impl.cpp index c118bcd3cb..708627c072 100644 --- a/modules/core/src/parallel_impl.cpp +++ b/modules/core/src/parallel_impl.cpp @@ -59,6 +59,8 @@ DECLARE_CV_PAUSE // https://github.com/riscv/riscv-isa-manual/issues/43 // # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0) # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0) +# elif defined __GNUC__ && defined __loongarch__ +# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0) # else # warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags." # define CV_PAUSE(...) 
do { /* no-op: works, but not effective */ } while (0) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 4e7e1b7ea0..027072a5da 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -434,6 +434,8 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL"; g_hwFeatureNames[CPU_RVV] = "RVV"; + + g_hwFeatureNames[CPU_LASX] = "LASX"; } void initialize(void) @@ -689,6 +691,10 @@ struct HWFeatures have[CV_CPU_RVV] = true; #endif + #if defined __loongarch_asx + have[CV_CPU_LASX] = true; + #endif + bool skip_baseline_check = false; #ifndef NO_GETENV if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK")) diff --git a/modules/core/test/test_hal_core.cpp b/modules/core/test/test_hal_core.cpp index 35fb977478..f9078e55f9 100644 --- a/modules/core/test/test_hal_core.cpp +++ b/modules/core/test/test_hal_core.cpp @@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp) int size = (hcase / 2) % 4; size = size == 0 ? 3 : size == 1 ? 4 : size == 2 ? 6 : 15; int nfunc = (hcase / 8); + #if CV_LASX + double eps = depth == CV_32F ? 1e-5 : 2e-10; + #else double eps = depth == CV_32F ? 1e-5 : 1e-10; + #endif if( size == 3 ) return; // TODO ??? diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index e0773d5214..08cda81819 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -8,8 +8,8 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") -ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV) -ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX) +ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX) +ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index dfa58b09fe..320a18e5ab 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -579,13 +579,14 @@ public: bool is1x1_; bool useAVX2; bool useAVX512; + bool useLASX; int blk_size_cn; int inpZp, outZp; const std::vector* multiplier; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), - biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false) + biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false) , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0) {} @@ -641,6 +642,8 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; + p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D; + int kernel_d = isConv3D? kernel_size[0] : 1; int kernel_h = isConv1D? 
1 : kernel_size[kernel_size.size() - 2]; int kernel_w = kernel_size.back(); @@ -837,6 +840,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else #endif { const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1210,6 +1220,12 @@ public: opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else #endif for( int i = 0; i < outCn; i += 2 ) { diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index dc759ebdbc..867f002dd4 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -226,7 +226,7 @@ public: { public: FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0), - dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {} + dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {} static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier, const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp) @@ -250,6 +250,7 @@ public: p.activ = !activationLUT.empty() ? 
activ : 0; p.useAVX2 = checkHardwareSupport(CPU_AVX2); p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; + p.useLASX = checkHardwareSupport(CPU_LASX); parallel_for_(Range(0, nstripes), p, nstripes); } @@ -294,6 +295,11 @@ public: if( useAVX2 ) opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else #endif { int i = 0; @@ -349,6 +355,7 @@ public: int nstripes, outZp; bool useAVX2; bool useAVX512; + bool useLASX; }; void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp index bf6149e5c9..1b3ac7a4b8 100644 --- a/modules/dnn/src/int8layers/layers_common.simd.hpp +++ b/modules/dnn/src/int8layers/layers_common.simd.hpp @@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights, } #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c) +{ + __m256i vzero = __lasx_xvreplgr2vr_d(0); + __m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b); + __m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b); + + __m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16); + __m256i odd_madd_ab = __lasx_xvsrai_w(madd_ab, 16); + + return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c); +} + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128i mask = __lsx_vld((const float*)maskbuf, 0); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const int8_t* wptr0 = weights + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + const int8_t* wptr2 = wptr1 + wstep; + int* outptr0 = output + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int* outptr2 = outptr1 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + mult2 = mult1; + + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + mult2 = mult1 = mult0; + } + } + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const int8_t* rptr = rowbuf + j*vecsize_aligned; + + __m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0), + vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0), + vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0), + vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0), + vs20 
= __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0), + vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0); + + for (; k < vecsize; k += 32, rptr += 32 ) + { + __m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0); + __m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0); + __m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0); + __m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0); + + vs00 = _v256_fmadds8_s32(w0, r0, vs00); + vs10 = _v256_fmadds8_s32(w1, r0, vs10); + vs20 = _v256_fmadds8_s32(w2, r0, vs20); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0); + vs01 = _v256_fmadds8_s32(w0, r0, vs01); + vs11 = _v256_fmadds8_s32(w1, r0, vs11); + vs21 = _v256_fmadds8_s32(w2, r0, vs21); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0); + vs02 = _v256_fmadds8_s32(w0, r0, vs02); + vs12 = _v256_fmadds8_s32(w1, r0, vs12); + vs22 = _v256_fmadds8_s32(w2, r0, vs22); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0); + vs03 = _v256_fmadds8_s32(w0, r0, vs03); + vs13 = _v256_fmadds8_s32(w1, r0, vs13); + vs23 = _v256_fmadds8_s32(w2, r0, vs23); + } + + /*t0*/ + __m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00); + __m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w); + + __m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01); + __m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w); + + __m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02); + __m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w); + + __m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03); + __m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w); + + __m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d); + __m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d); + __m256i t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*t1*/ + __m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10); + __m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w); + + __m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11); + __m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w); + + __m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12); + __m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w); + + __m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13); + __m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w); + + __m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d); + __m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d); + __m256i t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + /*t2*/ + __m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20); + __m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w); + + __m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21); + __m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w); + + __m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22); + __m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w); + + __m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23); + __m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w); + + __m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d); + __m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d); + __m256i t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20); + + t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1)); + t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1)); + t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1)); + + __m128i s0, s1, s2; + + if( initOutput ) + { + s0 = __lsx_vreplgr2vr_w(bias0); + s1 = __lsx_vreplgr2vr_w(bias1); + 
s2 = __lsx_vreplgr2vr_w(bias2); + } + else + { + s0 = __lsx_vld((__m128i*)(outptr0 + j), 0); + s1 = __lsx_vld((__m128i*)(outptr1 + j), 0); + s2 = __lsx_vld((__m128i*)(outptr2 + j), 0); + } + + s0 = __lsx_vadd_w(s0, *(__m128i*)&t0); + s1 = __lsx_vadd_w(s1, *(__m128i*)&t1); + s2 = __lsx_vadd_w(s2, *(__m128i*)&t2); + + if( finalOutput ) + { + __m128i voutzp = __lsx_vreplgr2vr_w(outZp); + __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127); + __m256 v_mult0 = _v256_setall_ps(mult0); + __m256 v_mult1 = _v256_setall_ps(mult1); + __m256 v_mult2 = _v256_setall_ps(mult2); + + s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0))); + s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1))); + s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2))); + + s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax); + s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax); + s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax); + } + if( tail ) + { + s0 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0), s0, mask); + s1 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0), s1, mask); + s2 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0), s2, mask); + } + __lsx_vst(s0, (__m128i*)(outptr0 + j), 0); + __lsx_vst(s1, (__m128i*)(outptr1 + j), 0); + __lsx_vst(s2, (__m128i*)(outptr2 + j), 0); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned; + int s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + r = rptr1[k]; + s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127); + } + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + int s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + } + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } +} + +static inline void _v256_expand_mul_add(const 
__m256i& a, const __m256i& b, + __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3) +{ + __m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0); + __m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0); + + __m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0); + __m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0); + + __m256i a0b0 = __lasx_xvmul_h(a0, b0); + __m256i a1b1 = __lasx_xvmul_h(a1, b1); + + out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0)); + out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0)); + out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0)); + out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0)); +} + +static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b) +{ + __m256i t0 = __lasx_xvld((const __m256i*)ptr, 0); + __m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1); + + const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = __lasx_xvshuf_b(t0, t0, sh); + __m256i p1 = __lasx_xvshuf_b(t1, t1, sh); + __m256i lo = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i hi = __lasx_xvpermi_q(p0, p1, 0x13); + + a = __lasx_xvilvl_d(hi, lo); + b = __lasx_xvilvh_d(hi, lo); +} + +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float mult = multptr[out_d]; + int bias = biasptr[out_d]; + int biasCopy; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 + dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 32; + __m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02), + vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12), + vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22); + __m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp), + outmin = 
__lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127); + __m256 vmult = _v256_setall_ps(mult); + __m256i vout0, vout1, vout2, vout3; + + if( stride_w == 1 ) + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0), + v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0), + v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0), + v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0), + v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0), + v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0), + v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0), + v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0), + v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0); + + vout0 = vout1 = vout2 = vout3 = vbias; + _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult))); + vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult))); + vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult))); + vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult))); + + vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax); + vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax); + vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax); + vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax); + + __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0); + __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4); + __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4); + __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4); + } + } + else + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + vout0 = vout1 = vout2 = vout3 = vbias; + _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, 
vout3); + _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult))); + vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult))); + vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult))); + vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult))); + + vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax); + vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax); + vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax); + vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax); + + __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0); + __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4); + __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4); + __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4); + } + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 8; i += 8 ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0), + vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0), + vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0), + vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0); + + __m128i voutzp = __lsx_vreplgr2vr_w(outZp); + __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0); + + vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0); + vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1); + vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2); + vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3); + vs4 = _v256_fmadds8_s32(__lasx_xvld((const 
__m256i*)(wptr + wstep*4), 0), v, vs4); + vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5); + vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6); + vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7); + } + + /*s0*/ + __m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0); + __m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w); + + __m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1); + __m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w); + + __m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2); + __m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w); + + __m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3); + __m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w); + + __m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d); + __m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d); + __m256i s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0); + + /*s1*/ + __m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4); + __m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w); + + __m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5); + __m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w); + + __m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6); + __m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w); + + __m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7); + __m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w); + + __m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d); + __m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d); + __m256i s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4); + + s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1)); + s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1)); + + __m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0)); + __m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4)); + + t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0)))); + t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4)))); + + t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax); + t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax); + + __lsx_vst(t0, (__m128i*)(dst + i), 0); + __lsx_vst(t1, (__m128i*)(dst + i), 4*4); + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = __lasx_xvreplgr2vr_d(0); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0); + vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0); + } + + __m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0); + int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3]; + dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]); + } + +} +#endif // CV_LASX + CV_CPU_OPTIMIZATION_NAMESPACE_END }} // namespace diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 7bb3014fc0..678a052c7c 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -986,12 +986,13 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; int blk_size_cn; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) - 
, blk_size_cn(0) + , useLASX(false), blk_size_cn(0) {} static void run( const Mat& input, Mat& output, const Mat& weights, @@ -1049,6 +1050,7 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D; + p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D; int kernel_d = isConv3D? kernel_size[0] : 1; int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; @@ -1256,6 +1258,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); + else #endif { const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1631,6 +1640,12 @@ public: opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, relu, cn0 == 0); + else #endif for( int i = 0; i < outCn; i += 2 ) { @@ -2437,6 +2452,7 @@ public: useAVX2 = checkHardwareSupport(CPU_AVX2); useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; useRVV = checkHardwareSupport(CPU_RVV); + useLASX = checkHardwareSupport(CPU_LASX); } void operator()(const Range& range_) const CV_OVERRIDE @@ -2474,6 +2490,11 @@ public: opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); } else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + else #endif for( m = 0; m < mmax; m += 2 ) { @@ -2574,6 +2595,7 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; }; class Col2ImInvoker : public cv::ParallelLoopBody diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 509f6cc177..71ca706ac4 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -173,7 +173,7 @@ public: class FullyConnected : public ParallelLoopBody { public: - FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {} + FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {} static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, const ActivationLayer* activ, int nstripes) @@ -197,6 +197,7 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2); p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; p.useRVV = checkHardwareSupport(CPU_RVV); + p.useLASX = checkHardwareSupport(CPU_LASX); parallel_for_(Range(0, nstripes), p, nstripes); } @@ -250,6 +251,11 @@ public: if( useRVV ) opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + else #endif { int i = 0; @@ -305,6 +311,7 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; }; #ifdef HAVE_OPENCL diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index fd88a3c3d2..f706abfa76 
100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr, #endif // CV_RVV +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + float r0 = 1.f, r1 = 1.f, r2 = 1.f; + __m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f); + __m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128i mask = __lsx_vld((const float*)maskbuf, 0); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const float* wptr0 = weights + i*wstep; + const float* wptr1 = wptr0 + wstep; + const float* wptr2 = wptr1 + wstep; + float* outptr0 = output + i*outPlaneSize; + float* outptr1 = outptr0 + outPlaneSize; + float* outptr2 = outptr1 + outPlaneSize; + float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + } + } + + if( relu ) + { + r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2]; + if( i+2 >= outCn ) + { + r2 = r1; + if( i+1 >= outCn ) + r2 = r1 = r0; + } + vr0 = _v256_extract_low(_v256_setall_ps(r0)); + vr1 = _v256_extract_low(_v256_setall_ps(r1)); + vr2 = _v256_extract_low(_v256_setall_ps(r2)); + } + + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const float* rptr = rowbuf + j*vecsize_aligned; + + __m256i tmp; + __m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp); + + for (; k < vecsize; k += 8, rptr += 8 ) + { + __m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0); + __m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0); + __m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0); + __m256 r0 = (__m256)__lasx_xvld(rptr, 0); + + vs00 = __lasx_xvfmadd_s(w0, r0, vs00); + vs10 = __lasx_xvfmadd_s(w1, r0, vs10); + vs20 = __lasx_xvfmadd_s(w2, r0, vs20); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0); + vs01 = __lasx_xvfmadd_s(w0, r0, vs01); + vs11 = __lasx_xvfmadd_s(w1, r0, vs11); + vs21 = __lasx_xvfmadd_s(w2, r0, vs21); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0); + vs02 = __lasx_xvfmadd_s(w0, r0, vs02); + vs12 = __lasx_xvfmadd_s(w1, r0, vs12); + vs22 = __lasx_xvfmadd_s(w2, r0, vs22); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0); + vs03 = 
__lasx_xvfmadd_s(w0, r0, vs03); + vs13 = __lasx_xvfmadd_s(w1, r0, vs13); + vs23 = __lasx_xvfmadd_s(w2, r0, vs23); + } + + /*t0*/ + __m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm); + __m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32); + __m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl); + + __m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm); + __m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32); + __m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl); + + __m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm); + __m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32); + __m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl); + + __m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm); + __m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32); + __m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl); + + __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w); + __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w); + __m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*t1*/ + __m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm); + __m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32); + __m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl); + + __m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm); + __m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32); + __m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl); + + __m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm); + __m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32); + __m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl); + + __m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm); + __m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32); + __m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl); + + __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w); + __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w); + __m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + /*t2*/ + __m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm); + __m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32); + __m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl); + + __m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm); + __m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32); + __m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl); + + __m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm); + __m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32); + __m256 vs22_add_4w = 
__lasx_xvfadd_s(vs22_add_2w, tmp22_srl); + + __m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm); + __m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32); + __m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl); + + __m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w); + __m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w); + __m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20); + + t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1)); + t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1)); + t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1)); + + __m128 s0, s1, s2; + + if( initOutput ) + { + s0 = _v256_extract_low(_v256_setall_ps(bias0)); + s1 = _v256_extract_low(_v256_setall_ps(bias1)); + s2 = _v256_extract_low(_v256_setall_ps(bias2)); + } + else + { + s0 = (__m128)__lsx_vld(outptr0 + j, 0); + s1 = (__m128)__lsx_vld(outptr1 + j, 0); + s2 = (__m128)__lsx_vld(outptr2 + j, 0); + } + + s0 = __lsx_vfadd_s(s0, *(__m128*)&t0); + s1 = __lsx_vfadd_s(s1, *(__m128*)&t1); + s2 = __lsx_vfadd_s(s2, *(__m128*)&t2); + + if( relu ) + { + __m128i m0 = __lsx_vfcmp_clt_s(z, s0); + __m128i m1 = __lsx_vfcmp_clt_s(z, s1); + __m128i m2 = __lsx_vfcmp_clt_s(z, s2); + s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0); + s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1); + s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2); + } + + if( tail ) + { + s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask); + s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask); + s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask); + } + + __lsx_vst(s0, outptr0 + j, 0); + __lsx_vst(s1, outptr1 + j, 0); + __lsx_vst(s2, outptr2 + j, 0); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const float* rptr0 = rowbuf + j*vecsize_aligned; + const float* rptr1 = rowbuf + (j+1)*vecsize_aligned; + float s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; + r = rptr1[k]; + s01 += w0*r; s11 += w1*r; s21 += w2*r; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s01 = s01 > 0.f ? s01 : s01*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s11 = s11 > 0.f ? s11 : s11*r1; + s20 = s20 > 0.f ? s20 : s20*r2; + s21 = s21 > 0.f ? s21 : s21*r2; + } + + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const float* rptr0 = rowbuf + j*vecsize_aligned; + float s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s20 = s20 > 0.f ? 
s20 : s20*r2; + } + + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } +} + +static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = (__m256)__lasx_xvld(ptr, 0); + __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); + + __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); + __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); + + a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); + b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), + vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), + vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); + __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), + vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), + v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), + v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), + v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), + v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), + v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), + v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), + v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), + v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ) +{ + int i = 0; + __m256i v256_tmp; + + for( ; i <= nvecs - 8; i += 8 ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = (__m256)__lasx_xvld(vec + k, 0); + + vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0); + vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1); + vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2); + vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3); + vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4); + vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5); + vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6); + vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7); + } + + /*s0*/ + __m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm); + __m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32); + __m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl); + + __m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm); + __m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32); + __m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl); + + __m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm); + __m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32); + __m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl); + + __m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm); + __m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32); + __m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl); + + __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w); + __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w); + __m256 s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*s1*/ + __m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + 
(0<<2) + 1); + __m256 vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm); + __m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32); + __m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl); + + __m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm); + __m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32); + __m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl); + + __m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm); + __m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32); + __m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl); + + __m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm); + __m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32); + __m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl); + + __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w); + __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w); + __m256 s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1)); + s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1)); + + s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0)); + s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4)); + + __lsx_vst(*(__m128*)&s0, dst + i, 0); + __lsx_vst(*(__m128*)&s1, dst + i, 4*4); + } + + float temp = 0.f; + for( ; i < nvecs; i++ ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = (__m256)__lasx_xvld(vec + k, 0); + vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0); + } + + __m256i vs0_perm = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm); + __m256i tmp_srl = __lasx_xvsrli_d(vs0_add_2w, 32); + __m256 vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl); + temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4]; + dst[i] = temp + bias[i]; + } +} + + +void fastGEMM( const float* aptr, size_t astep, const float* bptr, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ) +{ + int n = 0; + + for( ; n <= nb - 16; n += 16 ) + { + for( int m = 0; m < ma; m += 4 ) + { + const float* aptr0 = aptr + astep*m; + const float* aptr1 = aptr + astep*std::min(m+1, ma-1); + const float* aptr2 = aptr + astep*std::min(m+2, ma-1); + const float* aptr3 = aptr + astep*std::min(m+3, ma-1); + + float* cptr0 = cptr + cstep*m; + float* cptr1 = cptr + cstep*std::min(m+1, ma-1); + float* cptr2 = cptr + cstep*std::min(m+2, ma-1); + float* cptr3 = cptr + cstep*std::min(m+3, ma-1); + + __m256i v256_tmp; + __m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < na; k++ ) + { + __m256 a0 = _v256_setall_ps(aptr0[k]); + __m256 a1 = _v256_setall_ps(aptr1[k]); + __m256 a2 = _v256_setall_ps(aptr2[k]); + __m256 a3 = _v256_setall_ps(aptr3[k]); + + 
__m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0); + __m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0); + d00 = __lasx_xvfmadd_s(a0, b0, d00); + d01 = __lasx_xvfmadd_s(a0, b1, d01); + d10 = __lasx_xvfmadd_s(a1, b0, d10); + d11 = __lasx_xvfmadd_s(a1, b1, d11); + d20 = __lasx_xvfmadd_s(a2, b0, d20); + d21 = __lasx_xvfmadd_s(a2, b1, d21); + d30 = __lasx_xvfmadd_s(a3, b0, d30); + d31 = __lasx_xvfmadd_s(a3, b1, d31); + } + + __lasx_xvst(d00, cptr0 + n, 0); + __lasx_xvst(d01, cptr0 + n, 8*4); + __lasx_xvst(d10, cptr1 + n, 0); + __lasx_xvst(d11, cptr1 + n, 8*4); + __lasx_xvst(d20, cptr2 + n, 0); + __lasx_xvst(d21, cptr2 + n, 8*4); + __lasx_xvst(d30, cptr3 + n, 0); + __lasx_xvst(d31, cptr3 + n, 8*4); + } + } + + for( ; n < nb; n++ ) + { + for( int m = 0; m < ma; m++ ) + { + const float* aptr0 = aptr + astep*m; + float* cptr0 = cptr + cstep*m; + float d0 = 0.f; + + for( int k = 0; k < na; k++ ) + d0 += aptr0[k]*bptr[k*bstep + n]; + + cptr0[n] = d0; + } + } +} + +#endif // CV_LASX + CV_CPU_OPTIMIZATION_NAMESPACE_END }} // namespace diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index f57d05faee..c6dd063f73 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2178,6 +2178,9 @@ public: #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif + #if CV_TRY_LASX + bool useLASX = CV_CPU_HAS_SUPPORT_LASX; + #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols); @@ -2241,6 +2244,10 @@ public: if ( useAVX2 ) x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); #endif + #if CV_TRY_LASX + if ( useLASX ) + x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); + #endif #if CV_SIMD128 { v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp index 1f8f1c5d17..4b81b5e79d 100644 --- a/modules/imgproc/src/imgwarp.hpp +++ b/modules/imgproc/src/imgwarp.hpp @@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X #endif } +namespace opt_LASX +{ +#if CV_TRY_LASX +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); +#endif +} + namespace opt_SSE4_1 { #if CV_TRY_SSE4_1 diff --git a/modules/imgproc/src/imgwarp.lasx.cpp b/modules/imgproc/src/imgwarp.lasx.cpp new file mode 100644 index 0000000000..f6bf2a13cb --- /dev/null +++ b/modules/imgproc/src/imgwarp.lasx.cpp @@ -0,0 +1,98 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "imgwarp.hpp" +#include "opencv2/core/hal/intrin.hpp" + +namespace cv +{ +namespace opt_LASX +{ + +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) +{ + const int AB_BITS = MAX(10, (int)INTER_BITS); + int x1 = 0; + __m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1); + __m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0); + for (; x1 <= bw - 16; x1 += 16) + { + __m256i tx0, tx1, ty0, ty1; + tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX); + ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY); + tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX); + ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY); + + tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS); + ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS); + tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS); + ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS); + + __m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask), + __lasx_xvand_v(tx1, fxy_mask)); + __m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask), + __lasx_xvand_v(ty1, fxy_mask)); + tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS), + __lasx_xvsrai_w(tx1, INTER_BITS)); + ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS), + __lasx_xvsrai_w(ty1, INTER_BITS)); + fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS)); + fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); + + __lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0); + __lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2); + __lasx_xvst(fx_, (__m256i*)(alpha + x1), 0); + } + return x1; +} + +} +} +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 90a05085e3..4ad64534be 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify); } else +#endif +#if CV_TRY_LASX + if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify); + else + opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify); + } + else #endif { resizeNNInvoker invoker(src, dst, x_ofs, ify); diff --git a/modules/imgproc/src/resize.hpp b/modules/imgproc/src/resize.hpp index 67cf5184af..1636e7585e 100644 --- a/modules/imgproc/src/resize.hpp +++ b/modules/imgproc/src/resize.hpp @@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double); int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width); #endif } + +namespace opt_LASX +{ +#if CV_TRY_LASX +void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double); +void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double); +#endif +} + } #endif /* End of file. */ diff --git a/modules/imgproc/src/resize.lasx.cpp b/modules/imgproc/src/resize.lasx.cpp new file mode 100644 index 0000000000..fece47087d --- /dev/null +++ b/modules/imgproc/src/resize.lasx.cpp @@ -0,0 +1,249 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "resize.hpp" +#include "opencv2/core/hal/intrin.hpp" + +namespace cv +{ +namespace opt_LASX +{ + +class resizeNNInvokerLASX4 CV_FINAL : + public ParallelLoopBody +{ +public: + resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0x7); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val; + __lasx_xvst(pixels, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val; + __lasx_xvst(pixels, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat& src; + Mat& dst; + int* x_ofs; + double ify; + + resizeNNInvokerLASX4(const resizeNNInvokerLASX4&); + resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&); +}; + +class resizeNNInvokerLASX2 CV_FINAL : + public ParallelLoopBody +{ +public: + resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0xf); + const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, + 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); + const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), 
ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val; + + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val; + + const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000); + __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask); + __lasx_xvst(ints_permuted, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val; + + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val; + + const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000); + __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask); + __lasx_xvst(ints_permuted, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat& src; + Mat& dst; + int* x_ofs; + double ify; + + resizeNNInvokerLASX2(const resizeNNInvokerLASX2&); + resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&); +}; + +void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify) +{ + resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify) +{ + resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +} +} +/* End of file. */
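For reference, the per-pixel operation that both resizeNNInvokerLASX2 and resizeNNInvokerLASX4 vectorize is the same one their scalar tail loops already perform: copy the 2- or 4-byte source pixel found at the precomputed byte offset x_ofs[x] into consecutive output pixels. Below is a minimal scalar sketch of that contract; it is not part of the patch, the helper name resizeNNRowRef is illustrative only, and it assumes x_ofs[] holds byte offsets into the source row, as the tail loops above imply.

#include <cstring>
#include <cstddef>

// Scalar reference for one destination row of the nearest-neighbour resize:
// the LASX paths gather 8 outputs per iteration (4-byte pixels) or 16 outputs
// per iteration (2-byte pixels) and must produce exactly this result.
static void resizeNNRowRef(const unsigned char* S, unsigned char* D,
                           const int* x_ofs, int width, int pix_size)
{
    for (int x = 0; x < width; x++)
        std::memcpy(D + (std::size_t)x * pix_size,   // x-th output pixel
                    S + x_ofs[x],                    // source pixel, byte offset
                    (std::size_t)pix_size);
}

The 2-byte LASX path reaches the same result indirectly: v256_lut_quads gathers 4-byte groups from S and from S - 2, __lasx_xvbitsel_v merges the two gathers so each 32-bit lane carries a wanted pixel, and the __lasx_xvshuf_b / __lasx_xvperm_w pair puts the sixteen 2-byte pixels back in output order before the single 32-byte store.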