Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX

* Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX * Add resize.lasx.cpp for Loongson SIMD acceleration * Add imgwarp.lasx.cpp for Loongson SIMD acceleration * Add LASX acceleration support for dnn/conv * Add CV_PAUSE(v) for Loongarch * Set LASX by default on Loongarch64 * LoongArch: tune test threshold for Core/HAL.mat_decomp/15 Co-authored-by: shengwenxue <shengwenxue@loongson.cn>
2 years ago · 4154bd0667
parent 866191478f
commit 4154bd0667
24 changed files with 5071 additions and 6 deletions
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
+list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)

 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@ -380,6 +381,12 @@ elseif(RISCV)
  set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
  set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")

+elseif(LOONGARCH64)
+  ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
+  ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
+  set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
+
 endif()

 # Helper values for cmake-gui
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
  set(MIPS 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
  set(RISCV 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
+  set(LOONGARCH64 1)
 else()
  if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
    message(WARNING "OpenCV: unrecognized target processor configuration")
--- a/cmake/checks/cpu_lasx.cpp
+++ b/cmake/checks/cpu_lasx.cpp
@ -0,0 +1,23 @@
+#include <stdio.h>
+
+#if defined(__loongarch_asx)
+#  include <lasxintrin.h>
+#  define CV_LASX 1
+#endif
+
+#if defined CV_LASX
+int test()
+{
+    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
+    v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
+    return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s (val), 7);
+}
+#else
+#error "LASX is not supported"
+#endif
+
+int main()
+{
+  printf("%d\n", test());
+  return 0;
+}
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@ -172,6 +172,11 @@
 #  define CV_MSA 1
 #endif

+#ifdef CV_CPU_COMPILE_LASX
+#  include <lasxintrin.h>
+#  define CV_LASX 1
+#endif
+
 #ifdef __EMSCRIPTEN__
 #  define CV_WASM_SIMD 1
 #  include <wasm_simd128.h>
@ -370,3 +375,7 @@ struct VZeroUpperGuard {
 #ifndef CV_RVV
 #  define CV_RVV 0
 #endif
+
+#ifndef CV_LASX
+#  define CV_LASX 0
+#endif
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -525,5 +525,26 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)  CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))

+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
+#  define CV_TRY_LASX 1
+#  define CV_CPU_FORCE_LASX 1
+#  define CV_CPU_HAS_SUPPORT_LASX 1
+#  define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX
+#  define CV_TRY_LASX 1
+#  define CV_CPU_FORCE_LASX 0
+#  define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX))
+#  define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
+#  define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
+#else
+#  define CV_TRY_LASX 0
+#  define CV_CPU_FORCE_LASX 0
+#  define CV_CPU_HAS_SUPPORT_LASX 0
+#  define CV_CPU_CALL_LASX(fn, args)
+#  define CV_CPU_CALL_LASX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...)  CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
 #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -279,6 +279,8 @@ namespace cv {

 #define CV_CPU_RVV              210

+#define CV_CPU_LASX             230
+
 // CPU features groups
 #define CV_CPU_AVX512_SKX       256
 #define CV_CPU_AVX512_COMMON    257
@ -336,6 +338,8 @@ enum CpuFeatures {

    CPU_RVV             = 210,

+    CPU_LASX             = 230,
+
    CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
    CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
    CPU_AVX512_KNL      = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

 #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
 #include "opencv2/core/hal/intrin_rvv.hpp"
+
 #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
 #include "opencv2/core/hal/intrin_rvv_scalable.hpp"
+
+#elif CV_LASX
+    #if !defined(CV_FORCE_SIMD128_CPP)
+    #define CV_FORCE_SIMD128_CPP 1
+    #endif
+#include "opencv2/core/hal/intrin_cpp.hpp"
+
 #else

 #include "opencv2/core/hal/intrin_cpp.hpp"
@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

 #endif

+#if CV_LASX
+
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_lasx.hpp"
+
+#endif
+
 //! @cond IGNORED

 namespace cv {
--- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
--- a/modules/core/src/parallel_impl.cpp
+++ b/modules/core/src/parallel_impl.cpp
@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
 // https://github.com/riscv/riscv-isa-manual/issues/43
 // #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0)
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
+# elif defined __GNUC__ && defined __loongarch__
+#   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
 # else
 #   warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags."
 #   define CV_PAUSE(...) do { /* no-op: works, but not effective */ } while (0)
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -434,6 +434,8 @@ struct HWFeatures
        g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";

        g_hwFeatureNames[CPU_RVV] = "RVV";
+
+        g_hwFeatureNames[CPU_LASX] = "LASX";
    }

    void initialize(void)
@ -689,6 +691,10 @@ struct HWFeatures
        have[CV_CPU_RVV] = true;
    #endif

+    #if defined __loongarch_asx
+        have[CV_CPU_LASX] = true;
+    #endif
+
        bool skip_baseline_check = false;
 #ifndef NO_GETENV
        if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK"))
--- a/modules/core/test/test_hal_core.cpp
+++ b/modules/core/test/test_hal_core.cpp
@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp)
        int size = (hcase / 2) % 4;
        size = size == 0 ? 3 : size == 1 ? 4  : size == 2 ? 6 : 15;
        int nfunc = (hcase / 8);
+    #if CV_LASX
+        double eps = depth == CV_32F ? 1e-5 : 2e-10;
+    #else
        double eps = depth == CV_32F ? 1e-5 : 1e-10;
+    #endif

        if( size == 3 )
            return; // TODO ???
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@ -8,8 +8,8 @@ endif()

 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")

-ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)

 ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

--- a/modules/dnn/src/int8layers/convolution_layer.cpp
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@ -579,13 +579,14 @@ public:
        bool is1x1_;
        bool useAVX2;
        bool useAVX512;
+        bool useLASX;
        int blk_size_cn;
        int inpZp, outZp;
        const std::vector<float>* multiplier;

        ParallelConv()
            : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false)
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
            , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
        {}

@ -641,6 +642,8 @@ public:
            p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;

+            p.useLASX   = checkHardwareSupport(CPU_LASX) && isConv2D;
+
            int kernel_d = isConv3D? kernel_size[0] : 1;
            int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
            int kernel_w = kernel_size.back();
@ -837,6 +840,13 @@ public:
                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                            else
+                        #endif
+                        #if CV_TRY_LASX
+                            if(useLASX)
+                                opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                            else
                        #endif
                            {
                                const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@ -1210,6 +1220,12 @@ public:
                            opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                        else
+                    #endif
+                    #if CV_TRY_LASX
+                        if(useLASX)
+                            opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                        else
                    #endif
                        for( int i = 0; i < outCn; i += 2 )
                        {
--- a/modules/dnn/src/int8layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@ -226,7 +226,7 @@ public:
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                        const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@ -250,6 +250,7 @@ public:
            p.activ = !activationLUT.empty() ? activ : 0;
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
+            p.useLASX = checkHardwareSupport(CPU_LASX);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
@ -294,6 +295,11 @@ public:
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
+            #endif
+            #if CV_TRY_LASX
+                if( useLASX )
+                    opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
            #endif
                {
                    int i = 0;
@ -349,6 +355,7 @@ public:
        int nstripes, outZp;
        bool useAVX2;
        bool useAVX512;
+        bool useLASX;
    };

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
--- a/modules/dnn/src/int8layers/layers_common.simd.hpp
+++ b/modules/dnn/src/int8layers/layers_common.simd.hpp
@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
 }
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

+
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
+
+inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c)
+{
+    __m256i vzero = __lasx_xvreplgr2vr_d(0);
+    __m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b);
+    __m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b);
+
+    __m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16);
+    __m256i  odd_madd_ab = __lasx_xvsrai_w(madd_ab, 16);
+
+    return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c);
+}
+
+enum { FASCONV_BASE_VECSZ = 4 };
+
+void fastConv( const int8_t* weights, size_t wstep, const int* bias,
+               const int8_t* rowbuf, int* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned, int outZp,
+               const float* multiplier, bool initOutput, bool finalOutput )
+{
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2]*outShape[3];
+    int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
+    int rsz = blockSize % FASCONV_BASE_VECSZ;
+    for( int i = 0; i < rsz; i++ )
+        maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
+    __m128i mask = __lsx_vld((const float*)maskbuf, 0);
+
+    // now compute dot product of the weights
+    // and im2row-transformed part of the tensor
+    for( int i = 0; i < outCn; i += 3 )
+    {
+        const int8_t* wptr0 = weights + i*wstep;
+        const int8_t* wptr1 = wptr0 + wstep;
+        const int8_t* wptr2 = wptr1 + wstep;
+        int* outptr0 = output + i*outPlaneSize;
+        int* outptr1 = outptr0 + outPlaneSize;
+        int* outptr2 = outptr1 + outPlaneSize;
+        int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
+        float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
+
+        if( i+2 >= outCn )
+        {
+            wptr2 = wptr1;
+            outptr2 = outptr1;
+            bias2 = bias1;
+            mult2 = mult1;
+
+            if( i+1 >= outCn )
+            {
+                wptr2 = wptr1 = wptr0;
+                outptr2 = outptr1 = outptr0;
+                bias2 = bias1 = bias0;
+                mult2 = mult1 = mult0;
+            }
+        }
+        int j = 0;
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
+        {
+            bool tail = false;
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                if (j == 0)
+                    break;
+                j = blockSize - FASCONV_BASE_VECSZ;
+                tail = true;
+            }
+            int k = 0;
+            const int8_t* rptr = rowbuf + j*vecsize_aligned;
+
+            __m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0),
+                    vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0),
+                    vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0),
+                    vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0),
+                    vs20 = __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0),
+                    vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0);
+
+            for (; k < vecsize; k += 32, rptr += 32 )
+            {
+                __m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0);
+                __m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0);
+                __m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0);
+                __m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0);
+
+                vs00 = _v256_fmadds8_s32(w0, r0, vs00);
+                vs10 = _v256_fmadds8_s32(w1, r0, vs10);
+                vs20 = _v256_fmadds8_s32(w2, r0, vs20);
+
+                r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0);
+                vs01 = _v256_fmadds8_s32(w0, r0, vs01);
+                vs11 = _v256_fmadds8_s32(w1, r0, vs11);
+                vs21 = _v256_fmadds8_s32(w2, r0, vs21);
+
+                r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0);
+                vs02 = _v256_fmadds8_s32(w0, r0, vs02);
+                vs12 = _v256_fmadds8_s32(w1, r0, vs12);
+                vs22 = _v256_fmadds8_s32(w2, r0, vs22);
+
+                r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0);
+                vs03 = _v256_fmadds8_s32(w0, r0, vs03);
+                vs13 = _v256_fmadds8_s32(w1, r0, vs13);
+                vs23 = _v256_fmadds8_s32(w2, r0, vs23);
+            }
+
+            /*t0*/
+            __m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00);
+            __m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w);
+
+            __m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01);
+            __m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w);
+
+            __m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02);
+            __m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w);
+
+            __m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03);
+            __m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w);
+
+            __m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d);
+            __m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d);
+            __m256i        t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00);
+
+            /*t1*/
+            __m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10);
+            __m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w);
+
+            __m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11);
+            __m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w);
+
+            __m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12);
+            __m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w);
+
+            __m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13);
+            __m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w);
+
+            __m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d);
+            __m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d);
+            __m256i        t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10);
+
+            /*t2*/
+            __m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20);
+            __m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w);
+
+            __m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21);
+            __m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w);
+
+            __m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22);
+            __m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w);
+
+            __m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23);
+            __m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w);
+
+            __m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d);
+            __m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d);
+            __m256i        t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20);
+
+            t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1));
+            t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1));
+            t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1));
+
+            __m128i s0, s1, s2;
+
+            if( initOutput )
+            {
+                s0 = __lsx_vreplgr2vr_w(bias0);
+                s1 = __lsx_vreplgr2vr_w(bias1);
+                s2 = __lsx_vreplgr2vr_w(bias2);
+            }
+            else
+            {
+                s0 = __lsx_vld((__m128i*)(outptr0 + j), 0);
+                s1 = __lsx_vld((__m128i*)(outptr1 + j), 0);
+                s2 = __lsx_vld((__m128i*)(outptr2 + j), 0);
+            }
+
+            s0 = __lsx_vadd_w(s0, *(__m128i*)&t0);
+            s1 = __lsx_vadd_w(s1, *(__m128i*)&t1);
+            s2 = __lsx_vadd_w(s2, *(__m128i*)&t2);
+
+            if( finalOutput )
+            {
+                __m128i voutzp = __lsx_vreplgr2vr_w(outZp);
+                __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
+                __m256 v_mult0 = _v256_setall_ps(mult0);
+                __m256 v_mult1 = _v256_setall_ps(mult1);
+                __m256 v_mult2 = _v256_setall_ps(mult2);
+
+                s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0)));
+                s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1)));
+                s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2)));
+
+                s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax);
+                s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax);
+                s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax);
+            }
+            if( tail )
+            {
+                s0 =  __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0),  s0, mask);
+                s1 =  __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0),  s1, mask);
+                s2 =  __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0),  s2, mask);
+            }
+            __lsx_vst(s0, (__m128i*)(outptr0 + j), 0);
+            __lsx_vst(s1, (__m128i*)(outptr1 + j), 0);
+            __lsx_vst(s2, (__m128i*)(outptr2 + j), 0);
+        }
+
+        for( ; j <= blockSize - 2; j += 2 )
+        {
+            const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
+            const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
+            int s00, s01, s10, s11, s20, s21;
+
+            if( initOutput )
+            {
+                s00 = s01 = bias0;
+                s10 = s11 = bias1;
+                s20 = s21 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j]; s01 = outptr0[j+1];
+                s10 = outptr1[j]; s11 = outptr1[j+1];
+                s20 = outptr2[j]; s21 = outptr2[j+1];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                int8_t r = rptr0[k];
+                s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
+                r = rptr1[k];
+                s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
+            }
+
+            if( finalOutput )
+            {
+                s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
+                s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
+                s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
+                s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
+                s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
+                s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
+            }
+            outptr0[j] = s00;
+            outptr0[j+1] = s01;
+            outptr1[j] = s10;
+            outptr1[j+1] = s11;
+            outptr2[j] = s20;
+            outptr2[j+1] = s21;
+        }
+
+        for( ; j < blockSize; j++ )
+        {
+            const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
+            int s00, s10, s20;
+
+            if( initOutput )
+            {
+                s00 = bias0;
+                s10 = bias1;
+                s20 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j];
+                s10 = outptr1[j];
+                s20 = outptr2[j];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                int8_t r = rptr0[k];
+                s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
+            }
+
+            if( finalOutput )
+            {
+                s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
+                s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
+                s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
+            }
+            outptr0[j] = s00;
+            outptr1[j] = s10;
+            outptr2[j] = s20;
+        }
+    }
+}
+
+static inline void _v256_expand_mul_add(const __m256i& a, const __m256i& b,
+                                         __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
+{
+    __m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0);
+    __m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0);
+
+    __m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0);
+    __m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0);
+
+    __m256i a0b0 = __lasx_xvmul_h(a0, b0);
+    __m256i a1b1 = __lasx_xvmul_h(a1, b1);
+
+    out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0));
+    out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0));
+    out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0));
+    out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0));
+}
+
+static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
+{
+    __m256i t0 = __lasx_xvld((const __m256i*)ptr, 0);
+    __m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1);
+
+    const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    __m256i p0 = __lasx_xvshuf_b(t0, t0, sh);
+    __m256i p1 = __lasx_xvshuf_b(t1, t1, sh);
+    __m256i lo = __lasx_xvpermi_q(p0, p1, 0x02);
+    __m256i hi = __lasx_xvpermi_q(p0, p1, 0x13);
+
+    a = __lasx_xvilvl_d(hi, lo);
+    b = __lasx_xvilvh_d(hi, lo);
+}
+
+void fastDepthwiseConv( const int8_t* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const int* biasptr, const float* multptr,
+                     const int8_t* inptr_,
+                     int height, int width,
+                     int* outptr_,
+                     int out_d, int outH, int outW,
+                     int inpZp, int outZp)
+{
+    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float mult = multptr[out_d];
+    int bias = biasptr[out_d];
+    int biasCopy;
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const int8_t* imgptr0 = inptr_ + in_i*width;
+        const int8_t* imgptr1 = imgptr0 + dilation_h*width;
+        const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
+        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
+        int out;
+        biasCopy = bias;
+        if (in_i < 0)
+        {
+            biasCopy += inpZp * (w00 + w01 + w02);
+            w00 = w01 = w02 = 0;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            biasCopy += inpZp * (w20 + w21 + w22);
+            w20 = w21 = w22 = 0;
+            imgptr2 = imgptr1;
+        }
+        int* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
+                  (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
+                  (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
+                  biasCopy + inpZp*(w00 + w10 + w20);
+            outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+            out_j = 1;
+        }
+
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            const int VECSZ = 32;
+            __m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02),
+                    vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12),
+                    vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22);
+            __m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp),
+                    outmin = __lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127);
+            __m256 vmult = _v256_setall_ps(mult);
+            __m256i vout0, vout1, vout2, vout3;
+
+            if( stride_w == 1 )
+            {
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1)
+                    {
+                        if (out_j <= pad_l)
+                            break;
+                        out_j = outW1 - VECSZ;
+                    }
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0),
+                            v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0),
+                            v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0),
+                            v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0),
+                            v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0),
+                            v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0),
+                            v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0),
+                            v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0),
+                            v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0);
+
+                    vout0 = vout1 = vout2 = vout3 = vbias;
+                    _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
+
+                    vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
+                    vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
+                    vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
+                    vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
+
+                    vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
+                    vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
+                    vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
+                    vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
+
+                    __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
+                    __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
+                    __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
+                    __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
+                }
+            }
+            else
+            {
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1)
+                    {
+                        if (out_j <= pad_l)
+                            break;
+                        out_j = outW1 - VECSZ;
+                    }
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
+                    _v256_load_deinterleave(imgptr0 + in_j, v00, v01);
+                    _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
+                    _v256_load_deinterleave(imgptr1 + in_j, v10, v11);
+                    _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
+                    _v256_load_deinterleave(imgptr2 + in_j, v20, v21);
+                    _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
+
+                    vout0 = vout1 = vout2 = vout3 = vbias;
+                    _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
+                    _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
+
+                    vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
+                    vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
+                    vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
+                    vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
+
+                    vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
+                    vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
+                    vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
+                    vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
+
+                    __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
+                    __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
+                    __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
+                    __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
+                }
+            }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
+                  (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
+                  (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            int s0 = 1, s1 = 1, s2 = 1;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0;
+                biasCopy += inpZp*(w00 + w10 + w20);
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0;
+                biasCopy += inpZp*(w01 + w11 + w21);
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0;
+                biasCopy += inpZp*(w02 + w12 + w22);
+            }
+            out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
+                  (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
+                  (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+    }
+}
+
+// dst = vec * weights^t + bias
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp )
+{
+    int i = 0;
+
+    for( ; i <= nvecs - 8; i += 8 )
+    {
+        const int8_t* wptr = weights + i*wstep;
+        __m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0),
+                vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0),
+                vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0),
+                vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0);
+
+        __m128i voutzp = __lsx_vreplgr2vr_w(outZp);
+        __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
+
+        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
+        {
+            __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
+
+            vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
+            vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1);
+            vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2);
+            vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3);
+            vs4 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*4), 0), v, vs4);
+            vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5);
+            vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6);
+            vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7);
+        }
+
+        /*s0*/
+        __m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
+        __m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w);
+
+        __m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1);
+        __m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w);
+
+        __m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2);
+        __m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w);
+
+        __m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3);
+        __m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w);
+
+        __m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d);
+        __m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d);
+        __m256i      s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0);
+
+        /*s1*/
+        __m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4);
+        __m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w);
+
+        __m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5);
+        __m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w);
+
+        __m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6);
+        __m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w);
+
+        __m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7);
+        __m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w);
+
+        __m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d);
+        __m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d);
+        __m256i      s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4);
+
+        s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1));
+        s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1));
+
+        __m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0));
+        __m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4));
+
+        t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0))));
+        t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4))));
+
+        t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax);
+        t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax);
+
+        __lsx_vst(t0, (__m128i*)(dst + i), 0);
+        __lsx_vst(t1, (__m128i*)(dst + i), 4*4);
+    }
+
+    for( ; i < nvecs; i++ )
+    {
+        const int8_t* wptr = weights + i*wstep;
+        __m256i vs0 = __lasx_xvreplgr2vr_d(0);
+
+        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
+        {
+            __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
+                  vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
+        }
+
+        __m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
+        int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3];
+        dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
+    }
+
+}
+#endif // CV_LASX
+
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -986,12 +986,13 @@ public:
        bool useAVX2;
        bool useAVX512;
        bool useRVV;
+        bool useLASX;
        int blk_size_cn;

        ParallelConv()
            : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
              biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false)
-            , blk_size_cn(0)
+            , useLASX(false), blk_size_cn(0)
        {}

        static void run( const Mat& input, Mat& output, const Mat& weights,
@ -1049,6 +1050,7 @@ public:
            p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;
            p.useRVV   = checkHardwareSupport(CPU_RVV) && isConv2D;
+            p.useLASX  = checkHardwareSupport(CPU_LASX) && isConv2D;

            int kernel_d = isConv3D? kernel_size[0] : 1;
            int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@ -1256,6 +1258,13 @@ public:
                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                                    biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
                            else
+                        #endif
+                        #if CV_TRY_LASX
+                            if(useLASX)
+                                opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                            else
                        #endif
                            {
                                const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@ -1631,6 +1640,12 @@ public:
                            opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                         outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
                        else
+                    #endif
+                    #if CV_TRY_LASX
+                        if(useLASX)
+                            opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
+                        else
                    #endif
                        for( int i = 0; i < outCn; i += 2 )
                        {
@ -2437,6 +2452,7 @@ public:
            useAVX2 = checkHardwareSupport(CPU_AVX2);
            useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            useRVV = checkHardwareSupport(CPU_RVV);
+            useLASX = checkHardwareSupport(CPU_LASX);
        }

        void operator()(const Range& range_) const CV_OVERRIDE
@ -2474,6 +2490,11 @@ public:
                opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
            }
            else
+        #endif
+        #if CV_TRY_LASX
+            if( useLASX )
+                opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
+            else
        #endif
            for( m = 0; m < mmax; m += 2 )
            {
@ -2574,6 +2595,7 @@ public:
        bool useAVX2;
        bool useAVX512;
        bool useRVV;
+        bool useLASX;
    };

    class Col2ImInvoker : public cv::ParallelLoopBody
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -173,7 +173,7 @@ public:
    class FullyConnected : public ParallelLoopBody
    {
    public:
-        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {}
+        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                        Mat& dstMat, const ActivationLayer* activ, int nstripes)
@ -197,6 +197,7 @@ public:
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            p.useRVV = checkHardwareSupport(CPU_RVV);
+            p.useLASX = checkHardwareSupport(CPU_LASX);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
@ -250,6 +251,11 @@ public:
                if( useRVV )
                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
+            #endif
+            #if CV_TRY_LASX
+                if( useLASX )
+                    opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                else
            #endif
                {
                    int i = 0;
@ -305,6 +311,7 @@ public:
        bool useAVX2;
        bool useAVX512;
        bool useRVV;
+        bool useLASX;
    };

 #ifdef HAVE_OPENCL
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr,

 #endif // CV_RVV

+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
+
+enum { FASCONV_BASE_VECSZ = 4 };
+
+void fastConv( const float* weights, size_t wstep, const float* bias,
+               const float* rowbuf, float* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned,
+               const float* relu, bool initOutput )
+{
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2]*outShape[3];
+    float r0 = 1.f, r1 = 1.f, r2 = 1.f;
+    __m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f);
+    __m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2;
+    int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
+    int rsz = blockSize % FASCONV_BASE_VECSZ;
+    for( int i = 0; i < rsz; i++ )
+        maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
+    __m128i mask = __lsx_vld((const float*)maskbuf, 0);
+
+    // now compute dot product of the weights
+    // and im2row-transformed part of the tensor
+    for( int i = 0; i < outCn; i += 3 )
+    {
+        const float* wptr0 = weights + i*wstep;
+        const float* wptr1 = wptr0 + wstep;
+        const float* wptr2 = wptr1 + wstep;
+        float* outptr0 = output + i*outPlaneSize;
+        float* outptr1 = outptr0 + outPlaneSize;
+        float* outptr2 = outptr1 + outPlaneSize;
+        float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
+
+        if( i+2 >= outCn )
+        {
+            wptr2 = wptr1;
+            outptr2 = outptr1;
+            bias2 = bias1;
+            if( i+1 >= outCn )
+            {
+                wptr2 = wptr1 = wptr0;
+                outptr2 = outptr1 = outptr0;
+                bias2 = bias1 = bias0;
+            }
+        }
+
+        if( relu )
+        {
+            r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
+            if( i+2 >= outCn )
+            {
+                r2 = r1;
+                if( i+1 >= outCn )
+                    r2 = r1 = r0;
+            }
+            vr0 = _v256_extract_low(_v256_setall_ps(r0));
+            vr1 = _v256_extract_low(_v256_setall_ps(r1));
+            vr2 = _v256_extract_low(_v256_setall_ps(r2));
+        }
+
+        int j = 0;
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
+        {
+            bool tail = false;
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                if (j == 0)
+                    break;
+                j = blockSize - FASCONV_BASE_VECSZ;
+                tail = true;
+            }
+            int k = 0;
+            const float* rptr = rowbuf + j*vecsize_aligned;
+
+            __m256i tmp;
+            __m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp),
+                   vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp),
+                   vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp),
+                   vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp),
+                   vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp),
+                   vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp);
+
+            for (; k < vecsize; k += 8, rptr += 8 )
+            {
+                __m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0);
+                __m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0);
+                __m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0);
+                __m256 r0 = (__m256)__lasx_xvld(rptr, 0);
+
+                vs00 = __lasx_xvfmadd_s(w0, r0, vs00);
+                vs10 = __lasx_xvfmadd_s(w1, r0, vs10);
+                vs20 = __lasx_xvfmadd_s(w2, r0, vs20);
+
+                r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0);
+                vs01 = __lasx_xvfmadd_s(w0, r0, vs01);
+                vs11 = __lasx_xvfmadd_s(w1, r0, vs11);
+                vs21 = __lasx_xvfmadd_s(w2, r0, vs21);
+
+                r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0);
+                vs02 = __lasx_xvfmadd_s(w0, r0, vs02);
+                vs12 = __lasx_xvfmadd_s(w1, r0, vs12);
+                vs22 = __lasx_xvfmadd_s(w2, r0, vs22);
+
+                r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0);
+                vs03 = __lasx_xvfmadd_s(w0, r0, vs03);
+                vs13 = __lasx_xvfmadd_s(w1, r0, vs13);
+                vs23 = __lasx_xvfmadd_s(w2, r0, vs23);
+            }
+
+            /*t0*/
+            __m256  vs00_perm   = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm);
+            __m256  tmp00_srl   = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
+            __m256  vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
+
+            __m256  vs01_perm   = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm);
+            __m256  tmp01_srl   = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
+            __m256  vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
+
+            __m256  vs02_perm   = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm);
+            __m256  tmp02_srl   = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
+            __m256  vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
+
+            __m256  vs03_perm   = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm);
+            __m256  tmp03_srl   = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
+            __m256  vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
+
+            __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
+            __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
+            __m256         t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
+
+            /*t1*/
+            __m256  vs10_perm   = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm);
+            __m256  tmp10_srl   = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
+            __m256  vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
+
+            __m256  vs11_perm   = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm);
+            __m256  tmp11_srl   = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
+            __m256  vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
+
+            __m256  vs12_perm   = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm);
+            __m256  tmp12_srl   = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
+            __m256  vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
+
+            __m256  vs13_perm   = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm);
+            __m256  tmp13_srl   = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
+            __m256  vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
+
+            __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
+            __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
+            __m256         t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
+
+            /*t2*/
+            __m256  vs20_perm   = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm);
+            __m256  tmp20_srl   = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32);
+            __m256  vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl);
+
+            __m256  vs21_perm   = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm);
+            __m256  tmp21_srl   = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32);
+            __m256  vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl);
+
+            __m256  vs22_perm   = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm);
+            __m256  tmp22_srl   = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32);
+            __m256  vs22_add_4w = __lasx_xvfadd_s(vs22_add_2w, tmp22_srl);
+
+            __m256  vs23_perm   = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1);
+            __m256  vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm);
+            __m256  tmp23_srl   = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32);
+            __m256  vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl);
+
+            __m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w);
+            __m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w);
+            __m256         t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20);
+
+            t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1));
+            t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1));
+            t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1));
+
+            __m128 s0, s1, s2;
+
+            if( initOutput )
+            {
+                s0 = _v256_extract_low(_v256_setall_ps(bias0));
+                s1 = _v256_extract_low(_v256_setall_ps(bias1));
+                s2 = _v256_extract_low(_v256_setall_ps(bias2));
+            }
+            else
+            {
+                s0 = (__m128)__lsx_vld(outptr0 + j, 0);
+                s1 = (__m128)__lsx_vld(outptr1 + j, 0);
+                s2 = (__m128)__lsx_vld(outptr2 + j, 0);
+            }
+
+            s0 = __lsx_vfadd_s(s0, *(__m128*)&t0);
+            s1 = __lsx_vfadd_s(s1, *(__m128*)&t1);
+            s2 = __lsx_vfadd_s(s2, *(__m128*)&t2);
+
+            if( relu )
+            {
+                __m128i m0 = __lsx_vfcmp_clt_s(z, s0);
+                __m128i m1 = __lsx_vfcmp_clt_s(z, s1);
+                __m128i m2 = __lsx_vfcmp_clt_s(z, s2);
+                s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0);
+                s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1);
+                s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2);
+            }
+
+            if( tail )
+            {
+                s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask);
+                s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask);
+                s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask);
+            }
+
+            __lsx_vst(s0, outptr0 + j, 0);
+            __lsx_vst(s1, outptr1 + j, 0);
+            __lsx_vst(s2, outptr2 + j, 0);
+        }
+
+        for( ; j <= blockSize - 2; j += 2 )
+        {
+            const float* rptr0 = rowbuf + j*vecsize_aligned;
+            const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
+            float s00, s01, s10, s11, s20, s21;
+
+            if( initOutput )
+            {
+                s00 = s01 = bias0;
+                s10 = s11 = bias1;
+                s20 = s21 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j]; s01 = outptr0[j+1];
+                s10 = outptr1[j]; s11 = outptr1[j+1];
+                s20 = outptr2[j]; s21 = outptr2[j+1];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                float r = rptr0[k];
+                s00 += w0*r; s10 += w1*r; s20 += w2*r;
+                r = rptr1[k];
+                s01 += w0*r; s11 += w1*r; s21 += w2*r;
+            }
+
+            if( relu )
+            {
+                s00 = s00 > 0.f ? s00 : s00*r0;
+                s01 = s01 > 0.f ? s01 : s01*r0;
+                s10 = s10 > 0.f ? s10 : s10*r1;
+                s11 = s11 > 0.f ? s11 : s11*r1;
+                s20 = s20 > 0.f ? s20 : s20*r2;
+                s21 = s21 > 0.f ? s21 : s21*r2;
+            }
+
+            outptr0[j] = s00;
+            outptr0[j+1] = s01;
+            outptr1[j] = s10;
+            outptr1[j+1] = s11;
+            outptr2[j] = s20;
+            outptr2[j+1] = s21;
+        }
+
+        for( ; j < blockSize; j++ )
+        {
+            const float* rptr0 = rowbuf + j*vecsize_aligned;
+            float s00, s10, s20;
+
+            if( initOutput )
+            {
+                s00 = bias0;
+                s10 = bias1;
+                s20 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j];
+                s10 = outptr1[j];
+                s20 = outptr2[j];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                float r = rptr0[k];
+                s00 += w0*r; s10 += w1*r; s20 += w2*r;
+            }
+
+            if( relu )
+            {
+                s00 = s00 > 0.f ? s00 : s00*r0;
+                s10 = s10 > 0.f ? s10 : s10*r1;
+                s20 = s20 > 0.f ? s20 : s20*r2;
+            }
+
+            outptr0[j] = s00;
+            outptr1[j] = s10;
+            outptr2[j] = s20;
+        }
+    }
+}
+
+static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
+{
+    __m256 t0 = (__m256)__lasx_xvld(ptr, 0);
+    __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
+
+    __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
+    __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
+
+    a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
+    b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
+}
+
+void fastDepthwiseConv( const float* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const float* biasptr, const float* relu,
+                     const float* inptr_,
+                     int height, int width,
+                     float* outptr_,
+                     int out_d, int outH, int outW )
+{
+    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const float* imgptr0 = inptr_ + in_i*width;
+        const float* imgptr1 = imgptr0 + dilation_h*width;
+        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        float out, w00 = w00_, w01 = w01_, w02 = w02_;
+        float w20 = w20_, w21 = w21_, w22 = w22_;
+        if (in_i < 0)
+        {
+            w00 = w01 = w02 = 0.f;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            w20 = w21 = w22 = 0.f;
+            imgptr2 = imgptr1;
+        }
+        float* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[0] = out;
+            out_j = 1;
+        }
+
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            const int VECSZ = 8;
+            __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
+                   vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
+                   vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
+            __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
+            vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
+
+            if( stride_w == 1 )
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
+                           v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
+                           v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
+                           v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
+                           v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
+                           v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
+                           v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
+                           v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
+                           v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
+
+                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
+                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
+                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);
+
+                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
+                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
+                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
+
+                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
+                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
+                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
+
+                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
+                    if (relu)
+                    {
+                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
+                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
+                    }
+                    __lasx_xvst(vout0, outptr + out_j, 0);
+                }
+            else
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
+                    _v256_load_deinterleave(imgptr0 + in_j, v00, v01);
+                    _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
+                    _v256_load_deinterleave(imgptr1 + in_j, v10, v11);
+                    _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
+                    _v256_load_deinterleave(imgptr2 + in_j, v20, v21);
+                    _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
+
+                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
+                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
+                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);
+
+                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
+                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
+                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
+
+                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
+                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
+                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
+
+                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
+                    if (relu)
+                    {
+                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
+                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
+                    }
+                    __lasx_xvst(vout0, outptr + out_j, 0);
+                }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0.f;
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0.f;
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0.f;
+            }
+            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+    }
+}
+
+// dst = vec * weights^t + bias
+void fastGEMM1T( const float* vec, const float* weights,
+                 size_t wstep, const float* bias,
+                 float* dst, int nvecs, int vecsize )
+{
+    int i = 0;
+    __m256i v256_tmp;
+
+    for( ; i <= nvecs - 8; i += 8 )
+    {
+        const float* wptr = weights + i*wstep;
+        __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
+               vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
+               vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
+               vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+
+        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        {
+            __m256 v = (__m256)__lasx_xvld(vec + k, 0);
+
+            vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
+            vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1);
+            vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2);
+            vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3);
+            vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4);
+            vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5);
+            vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6);
+            vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7);
+        }
+
+        /*s0*/
+        __m256  vs00_perm   = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm);
+        __m256  tmp00_srl   = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
+        __m256  vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
+
+        __m256  vs01_perm   = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm);
+        __m256  tmp01_srl   = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
+        __m256  vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
+
+        __m256  vs02_perm   = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm);
+        __m256  tmp02_srl   = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
+        __m256  vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
+
+        __m256  vs03_perm   = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm);
+        __m256  tmp03_srl   = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
+        __m256  vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
+
+        __m256i  vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
+        __m256i  vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
+        __m256          s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
+
+        /*s1*/
+        __m256  vs10_perm   = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm);
+        __m256  tmp10_srl   = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
+        __m256  vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
+
+        __m256  vs11_perm   = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm);
+        __m256  tmp11_srl   = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
+        __m256  vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
+
+        __m256  vs12_perm   = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm);
+        __m256  tmp12_srl   = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
+        __m256  vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
+
+        __m256  vs13_perm   = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm);
+        __m256  tmp13_srl   = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
+        __m256  vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
+
+        __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
+        __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
+        __m256         s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
+
+        s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1));
+        s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1));
+
+        s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0));
+        s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4));
+
+        __lsx_vst(*(__m128*)&s0, dst + i, 0);
+        __lsx_vst(*(__m128*)&s1, dst + i, 4*4);
+    }
+
+    float temp = 0.f;
+    for( ; i < nvecs; i++ )
+    {
+        const float* wptr = weights + i*wstep;
+        __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+
+        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        {
+            __m256 v = (__m256)__lasx_xvld(vec + k, 0);
+            vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
+        }
+
+        __m256i vs0_perm   = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
+        __m256  vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm);
+        __m256i tmp_srl    = __lasx_xvsrli_d(vs0_add_2w, 32);
+        __m256  vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl);
+        temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4];
+        dst[i] = temp + bias[i];
+    }
+}
+
+
+void fastGEMM( const float* aptr, size_t astep, const float* bptr,
+               size_t bstep, float* cptr, size_t cstep,
+               int ma, int na, int nb )
+{
+    int n = 0;
+
+    for( ; n <= nb - 16; n += 16 )
+    {
+        for( int m = 0; m < ma; m += 4 )
+        {
+            const float* aptr0 = aptr + astep*m;
+            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
+            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
+            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
+
+            float* cptr0 = cptr + cstep*m;
+            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
+            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
+            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
+
+            __m256i v256_tmp;
+            __m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+            __m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+            __m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+            __m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
+
+            for( int k = 0; k < na; k++ )
+            {
+                __m256 a0 = _v256_setall_ps(aptr0[k]);
+                __m256 a1 = _v256_setall_ps(aptr1[k]);
+                __m256 a2 = _v256_setall_ps(aptr2[k]);
+                __m256 a3 = _v256_setall_ps(aptr3[k]);
+
+                __m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0);
+                __m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0);
+                d00 = __lasx_xvfmadd_s(a0, b0, d00);
+                d01 = __lasx_xvfmadd_s(a0, b1, d01);
+                d10 = __lasx_xvfmadd_s(a1, b0, d10);
+                d11 = __lasx_xvfmadd_s(a1, b1, d11);
+                d20 = __lasx_xvfmadd_s(a2, b0, d20);
+                d21 = __lasx_xvfmadd_s(a2, b1, d21);
+                d30 = __lasx_xvfmadd_s(a3, b0, d30);
+                d31 = __lasx_xvfmadd_s(a3, b1, d31);
+            }
+
+            __lasx_xvst(d00, cptr0 + n, 0);
+            __lasx_xvst(d01, cptr0 + n, 8*4);
+            __lasx_xvst(d10, cptr1 + n, 0);
+            __lasx_xvst(d11, cptr1 + n, 8*4);
+            __lasx_xvst(d20, cptr2 + n, 0);
+            __lasx_xvst(d21, cptr2 + n, 8*4);
+            __lasx_xvst(d30, cptr3 + n, 0);
+            __lasx_xvst(d31, cptr3 + n, 8*4);
+        }
+    }
+
+    for( ; n < nb; n++ )
+    {
+        for( int m = 0; m < ma; m++ )
+        {
+            const float* aptr0 = aptr + astep*m;
+            float* cptr0 = cptr + cstep*m;
+            float d0 = 0.f;
+
+            for( int k = 0; k < na; k++ )
+                d0 += aptr0[k]*bptr[k*bstep + n];
+
+            cptr0[n] = d0;
+        }
+    }
+}
+
+#endif // CV_LASX
+
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -2178,6 +2178,9 @@ public:
    #if CV_TRY_SSE4_1
        bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
    #endif
+    #if CV_TRY_LASX
+        bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
+    #endif

        int bh0 = std::min(BLOCK_SZ/2, dst.rows);
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@ -2241,6 +2244,10 @@ public:
                        if ( useAVX2 )
                            x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
                        #endif
+                        #if CV_TRY_LASX
+                        if ( useLASX )
+                            x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
+                        #endif
                        #if CV_SIMD128
                        {
                            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
--- a/modules/imgproc/src/imgwarp.hpp
+++ b/modules/imgproc/src/imgwarp.hpp
@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X
 #endif
 }

+namespace opt_LASX
+{
+#if CV_TRY_LASX
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
+#endif
+}
+
 namespace opt_SSE4_1
 {
 #if CV_TRY_SSE4_1
--- a/modules/imgproc/src/imgwarp.lasx.cpp
+++ b/modules/imgproc/src/imgwarp.lasx.cpp
@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* ////////////////////////////////////////////////////////////////////
+//
+//  Geometrical transforms on images and matrices: rotation, zoom etc.
+//
+// */
+
+#include "precomp.hpp"
+#include "imgwarp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+namespace cv
+{
+namespace opt_LASX
+{
+
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
+{
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    int x1 = 0;
+    __m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1);
+    __m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0);
+    for (; x1 <= bw - 16; x1 += 16)
+    {
+        __m256i tx0, tx1, ty0, ty1;
+        tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX);
+        ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY);
+        tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX);
+        ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY);
+
+        tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS);
+        ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS);
+        tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS);
+        ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS);
+
+        __m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask),
+            __lasx_xvand_v(tx1, fxy_mask));
+        __m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask),
+            __lasx_xvand_v(ty1, fxy_mask));
+        tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS),
+            __lasx_xvsrai_w(tx1, INTER_BITS));
+        ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS),
+            __lasx_xvsrai_w(ty1, INTER_BITS));
+        fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS));
+        fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
+
+        __lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0);
+        __lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2);
+        __lasx_xvst(fx_, (__m256i*)(alpha + x1), 0);
+    }
+    return x1;
+}
+
+}
+}
+/* End of file. */
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
            opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
    }
    else
+#endif
+#if CV_TRY_LASX
+    if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+            opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify);
+        else
+            opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify);
+    }
+    else
 #endif
    {
        resizeNNInvoker invoker(src, dst, x_ofs, ify);
--- a/modules/imgproc/src/resize.hpp
+++ b/modules/imgproc/src/resize.hpp
@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double);
 int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width);
 #endif
 }
+
+namespace opt_LASX
+{
+#if CV_TRY_LASX
+void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double);
+void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double);
+#endif
+}
+
 }
 #endif
 /* End of file. */
--- a/modules/imgproc/src/resize.lasx.cpp
+++ b/modules/imgproc/src/resize.lasx.cpp
@ -0,0 +1,249 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* ////////////////////////////////////////////////////////////////////
+//
+//  Geometrical transforms on images and matrices: rotation, zoom etc.
+//
+// */
+
+#include "precomp.hpp"
+#include "resize.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+namespace cv
+{
+namespace opt_LASX
+{
+
+class resizeNNInvokerLASX4 CV_FINAL :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
+        ify(_ify)
+    {
+    }
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x;
+        int width = dsize.width;
+        int avxWidth = width - (width & 0x7);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
+#pragma unroll(4)
+#endif
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
+                    __lasx_xvst(pixels, (int*)D, 0);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
+#pragma unroll(4)
+#endif
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
+                    __lasx_xvst(pixels, (int*)D, 0);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat& src;
+    Mat& dst;
+    int* x_ofs;
+    double ify;
+
+    resizeNNInvokerLASX4(const resizeNNInvokerLASX4&);
+    resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&);
+};
+
+class resizeNNInvokerLASX2 CV_FINAL :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
+        ify(_ify)
+    {
+    }
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x;
+        int width = dsize.width;
+        int avxWidth = width - (width & 0xf);
+        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                     15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#ifdef CV_ICC
+#pragma unroll(4)
+#endif
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
+
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
+
+                    const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
+                    __lasx_xvst(ints_permuted, (int*)D, 0);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#ifdef CV_ICC
+#pragma unroll(4)
+#endif
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
+
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
+
+                    const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
+                    __lasx_xvst(ints_permuted, (int*)D, 0);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat& src;
+    Mat& dst;
+    int* x_ofs;
+    double ify;
+
+    resizeNNInvokerLASX2(const resizeNNInvokerLASX2&);
+    resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&);
+};
+
+void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
+{
+    resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify);
+    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
+}
+
+void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
+{
+    resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify);
+    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
+}
+
+}
+}
+/* End of file. */