Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX

* Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX
* Add resize.lasx.cpp for Loongson SIMD acceleration
* Add imgwarp.lasx.cpp for Loongson SIMD acceleration
* Add LASX acceleration support for dnn/conv
* Add CV_PAUSE(v) for Loongarch
* Set LASX by default on Loongarch64
* LoongArch: tune test threshold for Core/HAL.mat_decomp/15

Co-authored-by: shengwenxue <shengwenxue@loongson.cn>
Branch: pull/22495/head
Author: wxsheng, committed via GitHub
Parent: 866191478f
Commit: 4154bd0667
Changed files, with the number of changed lines in parentheses:
  1. cmake/OpenCVCompilerOptimizations.cmake (7)
  2. cmake/OpenCVDetectCXXCompiler.cmake (2)
  3. cmake/checks/cpu_lasx.cpp (23)
  4. modules/core/include/opencv2/core/cv_cpu_dispatch.h (9)
  5. modules/core/include/opencv2/core/cv_cpu_helper.h (21)
  6. modules/core/include/opencv2/core/cvdef.h (4)
  7. modules/core/include/opencv2/core/hal/intrin.hpp (16)
  8. modules/core/include/opencv2/core/hal/intrin_lasx.hpp (3236)
  9. modules/core/src/parallel_impl.cpp (2)
  10. modules/core/src/system.cpp (6)
  11. modules/core/test/test_hal_core.cpp (4)
  12. modules/dnn/CMakeLists.txt (4)
  13. modules/dnn/src/int8layers/convolution_layer.cpp (18)
  14. modules/dnn/src/int8layers/fully_connected_layer.cpp (9)
  15. modules/dnn/src/int8layers/layers_common.simd.hpp (624)
  16. modules/dnn/src/layers/convolution_layer.cpp (24)
  17. modules/dnn/src/layers/fully_connected_layer.cpp (9)
  18. modules/dnn/src/layers/layers_common.simd.hpp (679)
  19. modules/imgproc/src/imgwarp.cpp (7)
  20. modules/imgproc/src/imgwarp.hpp (7)
  21. modules/imgproc/src/imgwarp.lasx.cpp (98)
  22. modules/imgproc/src/resize.cpp (10)
  23. modules/imgproc/src/resize.hpp (9)
  24. modules/imgproc/src/resize.lasx.cpp (249)
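
With the changes listed above, LASX becomes a regular OpenCV CPU feature. A minimal runtime check (a sketch, not part of this commit, assuming OpenCV was built from this branch on LoongArch64 hardware; it only uses the existing public API):

#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    // CPU_LASX (feature id 230) is the CpuFeatures entry added by this commit.
    std::cout << "LASX available: "
              << (cv::checkHardwareSupport(cv::CPU_LASX) ? "yes" : "no") << std::endl;
    // The build summary also lists the baseline/dispatched CPU features,
    // e.g. when configured with -DCPU_BASELINE=LASX.
    std::cout << cv::getBuildInformation() << std::endl;
    return 0;
}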

@@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -380,6 +381,12 @@ elseif(RISCV)
set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
# Helper values for cmake-gui

@@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
set(MIPS 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
set(RISCV 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
set(LOONGARCH64 1)
else()
if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
message(WARNING "OpenCV: unrecognized target processor configuration")

@@ -0,0 +1,23 @@
#include <stdio.h>
#if defined(__loongarch_asx)
# include <lasxintrin.h>
# define CV_LASX 1
#endif
#if defined CV_LASX
int test()
{
const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s (val), 7);
}
#else
#error "LASX is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}

@@ -172,6 +172,11 @@
# define CV_MSA 1
#endif
#ifdef CV_CPU_COMPILE_LASX
# include <lasxintrin.h>
# define CV_LASX 1
#endif
#ifdef __EMSCRIPTEN__
# define CV_WASM_SIMD 1
# include <wasm_simd128.h>
@@ -370,3 +375,7 @@ struct VZeroUpperGuard {
#ifndef CV_RVV
# define CV_RVV 0
#endif
#ifndef CV_LASX
# define CV_LASX 0
#endif

@@ -525,5 +525,26 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 1
# define CV_CPU_HAS_SUPPORT_LASX 1
# define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 0
# define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX))
# define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
# define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
#else
# define CV_TRY_LASX 0
# define CV_CPU_FORCE_LASX 0
# define CV_CPU_HAS_SUPPORT_LASX 0
# define CV_CPU_CALL_LASX(fn, args)
# define CV_CPU_CALL_LASX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...) CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
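
For reference, the new CV_CPU_CALL_LASX / __CV_CPU_DISPATCH_CHAIN_LASX helpers above are consumed by the existing CV_CPU_DISPATCH machinery. A minimal sketch of a dispatch site (the file and function names below are hypothetical; the pattern mirrors OpenCV's existing dispatched source files):

// my_filter.cpp
#include "my_filter.simd.hpp"
#include "my_filter.simd_declarations.hpp" // generated by CMake; defines CV_CPU_DISPATCH_MODES_ALL

void myFilter(const uchar* src, uchar* dst, int len)
{
    // Expands through the __CV_CPU_DISPATCH_CHAIN_* macros: with LASX in the
    // dispatch list, CV_CPU_CALL_LASX() tests CV_CPU_HAS_SUPPORT_LASX at run
    // time and calls opt_LASX::myFilter() before falling back to the baseline.
    CV_CPU_DISPATCH(myFilter, (src, dst, len), CV_CPU_DISPATCH_MODES_ALL);
}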

@@ -279,6 +279,8 @@ namespace cv {
#define CV_CPU_RVV 210
#define CV_CPU_LASX 230
// CPU features groups
#define CV_CPU_AVX512_SKX 256
#define CV_CPU_AVX512_COMMON 257
@@ -336,6 +338,8 @@ enum CpuFeatures {
CPU_RVV = 210,
CPU_LASX = 230,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF

@@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1
#endif
#include "opencv2/core/hal/intrin_cpp.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
@@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
#if CV_LASX
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_lasx.hpp"
#endif
//! @cond IGNORED
namespace cv {

modules/core/include/opencv2/core/hal/intrin_lasx.hpp: file diff suppressed because it is too large (3236 lines added).

@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
// https://github.com/riscv/riscv-isa-manual/issues/43
// # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0)
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# elif defined __GNUC__ && defined __loongarch__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# else
# warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags."
# define CV_PAUSE(...) do { /* no-op: works, but not effective */ } while (0)

@@ -434,6 +434,8 @@ struct HWFeatures
g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_LASX] = "LASX";
}
void initialize(void)
@@ -689,6 +691,10 @@ struct HWFeatures
have[CV_CPU_RVV] = true;
#endif
#if defined __loongarch_asx
have[CV_CPU_LASX] = true;
#endif
bool skip_baseline_check = false;
#ifndef NO_GETENV
if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK"))

@@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp)
int size = (hcase / 2) % 4;
size = size == 0 ? 3 : size == 1 ? 4 : size == 2 ? 6 : 15;
int nfunc = (hcase / 8);
#if CV_LASX
double eps = depth == CV_32F ? 1e-5 : 2e-10;
#else
double eps = depth == CV_32F ? 1e-5 : 1e-10;
#endif
if( size == 3 )
return; // TODO ???

@@ -8,8 +8,8 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

@@ -579,13 +579,14 @@ public:
bool is1x1_;
bool useAVX2;
bool useAVX512;
bool useLASX;
int blk_size_cn;
int inpZp, outZp;
const std::vector<float>* multiplier;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
, blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
{}
@@ -641,6 +642,8 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
@@ -837,6 +840,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1210,6 +1220,12 @@ public:
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{

@@ -226,7 +226,7 @@ public:
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -250,6 +250,7 @@ public:
p.activ = !activationLUT.empty() ? activ : 0;
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useLASX = checkHardwareSupport(CPU_LASX);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -294,6 +295,11 @@ public:
if( useAVX2 )
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
{
int i = 0;
@@ -349,6 +355,7 @@ public:
int nstripes, outZp;
bool useAVX2;
bool useAVX512;
bool useLASX;
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE

@@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
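// Signed int8 multiply-accumulate: for each 32-bit lane, adjacent pairs of
// int8 products a[i]*b[i] are accumulated in int16 via the even/odd widening
// multiply-add instructions, then widened to int32, summed, and added to c,
// i.e. a four-element int8 dot product per 32-bit lane.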
inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c)
{
__m256i vzero = __lasx_xvreplgr2vr_d(0);
__m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b);
__m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b);
__m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16);
__m256i odd_madd_ab = __lasx_xvsrai_w(madd_ab, 16);
return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c);
}
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const int8_t* wptr0 = weights + i*wstep;
const int8_t* wptr1 = wptr0 + wstep;
const int8_t* wptr2 = wptr1 + wstep;
int* outptr0 = output + i*outPlaneSize;
int* outptr1 = outptr0 + outPlaneSize;
int* outptr2 = outptr1 + outPlaneSize;
int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
mult2 = mult1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
mult2 = mult1 = mult0;
}
}
int j = 0;
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const int8_t* rptr = rowbuf + j*vecsize_aligned;
__m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0),
vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0),
vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0),
vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0),
vs20 = __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0),
vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0);
for (; k < vecsize; k += 32, rptr += 32 )
{
__m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0);
__m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0);
__m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0);
__m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0);
vs00 = _v256_fmadds8_s32(w0, r0, vs00);
vs10 = _v256_fmadds8_s32(w1, r0, vs10);
vs20 = _v256_fmadds8_s32(w2, r0, vs20);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0);
vs01 = _v256_fmadds8_s32(w0, r0, vs01);
vs11 = _v256_fmadds8_s32(w1, r0, vs11);
vs21 = _v256_fmadds8_s32(w2, r0, vs21);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0);
vs02 = _v256_fmadds8_s32(w0, r0, vs02);
vs12 = _v256_fmadds8_s32(w1, r0, vs12);
vs22 = _v256_fmadds8_s32(w2, r0, vs22);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0);
vs03 = _v256_fmadds8_s32(w0, r0, vs03);
vs13 = _v256_fmadds8_s32(w1, r0, vs13);
vs23 = _v256_fmadds8_s32(w2, r0, vs23);
}
/*t0*/
__m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00);
__m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w);
__m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01);
__m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w);
__m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02);
__m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w);
__m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03);
__m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w);
__m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d);
__m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d);
__m256i t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00);
/*t1*/
__m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10);
__m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w);
__m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11);
__m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w);
__m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12);
__m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w);
__m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13);
__m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w);
__m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d);
__m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d);
__m256i t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10);
/*t2*/
__m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20);
__m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w);
__m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21);
__m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w);
__m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22);
__m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w);
__m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23);
__m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w);
__m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d);
__m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d);
__m256i t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20);
t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1));
t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1));
t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1));
__m128i s0, s1, s2;
if( initOutput )
{
s0 = __lsx_vreplgr2vr_w(bias0);
s1 = __lsx_vreplgr2vr_w(bias1);
s2 = __lsx_vreplgr2vr_w(bias2);
}
else
{
s0 = __lsx_vld((__m128i*)(outptr0 + j), 0);
s1 = __lsx_vld((__m128i*)(outptr1 + j), 0);
s2 = __lsx_vld((__m128i*)(outptr2 + j), 0);
}
s0 = __lsx_vadd_w(s0, *(__m128i*)&t0);
s1 = __lsx_vadd_w(s1, *(__m128i*)&t1);
s2 = __lsx_vadd_w(s2, *(__m128i*)&t2);
if( finalOutput )
{
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
__m256 v_mult0 = _v256_setall_ps(mult0);
__m256 v_mult1 = _v256_setall_ps(mult1);
__m256 v_mult2 = _v256_setall_ps(mult2);
s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0)));
s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1)));
s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2)));
s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax);
s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax);
s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax);
}
if( tail )
{
s0 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0), s0, mask);
s1 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0), s1, mask);
s2 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0), s2, mask);
}
__lsx_vst(s0, (__m128i*)(outptr0 + j), 0);
__lsx_vst(s1, (__m128i*)(outptr1 + j), 0);
__lsx_vst(s2, (__m128i*)(outptr2 + j), 0);
}
for( ; j <= blockSize - 2; j += 2 )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
int s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
r = rptr1[k];
s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
int s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
}
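// Widens the 32 signed int8 elements of a and b to int16, multiplies them
// element-wise, then widens the products to int32 and accumulates them, in
// order, into the four 8-lane accumulators out0..out3.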
static inline void _v256_expand_mul_add(const __m256i& a, const __m256i& b,
__m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
{
__m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0);
__m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0);
__m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0);
__m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0);
__m256i a0b0 = __lasx_xvmul_h(a0, b0);
__m256i a1b1 = __lasx_xvmul_h(a1, b1);
out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0));
out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0));
out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0));
out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0));
}
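// Loads 64 consecutive int8 values starting at ptr and de-interleaves them:
// a receives the even-indexed elements, b the odd-indexed ones.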
static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
{
__m256i t0 = __lasx_xvld((const __m256i*)ptr, 0);
__m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1);
const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = __lasx_xvshuf_b(t0, t0, sh);
__m256i p1 = __lasx_xvshuf_b(t1, t1, sh);
__m256i lo = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i hi = __lasx_xvpermi_q(p0, p1, 0x13);
a = __lasx_xvilvl_d(hi, lo);
b = __lasx_xvilvh_d(hi, lo);
}
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp)
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float mult = multptr[out_d];
int bias = biasptr[out_d];
int biasCopy;
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t* imgptr0 = inptr_ + in_i*width;
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out;
biasCopy = bias;
if (in_i < 0)
{
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
biasCopy + inpZp*(w00 + w10 + w20);
outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 32;
__m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02),
vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12),
vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22);
__m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp),
outmin = __lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127);
__m256 vmult = _v256_setall_ps(mult);
__m256i vout0, vout1, vout2, vout3;
if( stride_w == 1 )
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0),
v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0),
v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0),
v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0),
v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0),
v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0),
v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0),
v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0),
v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0);
vout0 = vout1 = vout2 = vout3 = vbias;
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
}
}
else
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
vout0 = vout1 = vout2 = vout3 = vbias;
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
}
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp*(w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp*(w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp*(w02 + w12 + w22);
}
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
}
}
// dst = vec * weights^t + bias
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0),
vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0),
vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0),
vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0);
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1);
vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2);
vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3);
vs4 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*4), 0), v, vs4);
vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5);
vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6);
vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7);
}
/*s0*/
__m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
__m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w);
__m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1);
__m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w);
__m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2);
__m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w);
__m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3);
__m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w);
__m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d);
__m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d);
__m256i s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0);
/*s1*/
__m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4);
__m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w);
__m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5);
__m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w);
__m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6);
__m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w);
__m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7);
__m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w);
__m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d);
__m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d);
__m256i s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4);
s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1));
s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1));
__m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0));
__m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4));
t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0))));
t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4))));
t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax);
t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax);
__lsx_vst(t0, (__m128i*)(dst + i), 0);
__lsx_vst(t1, (__m128i*)(dst + i), 4*4);
}
for( ; i < nvecs; i++ )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = __lasx_xvreplgr2vr_d(0);
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
}
__m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3];
dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@@ -986,12 +986,13 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
int blk_size_cn;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false)
, useLASX(false), blk_size_cn(0)
{}
static void run( const Mat& input, Mat& output, const Mat& weights,
@@ -1049,6 +1050,7 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;
p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -1256,6 +1258,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1631,6 +1640,12 @@ public:
opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{
@@ -2437,6 +2452,7 @@ public:
useAVX2 = checkHardwareSupport(CPU_AVX2);
useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
useRVV = checkHardwareSupport(CPU_RVV);
useLASX = checkHardwareSupport(CPU_LASX);
}
void operator()(const Range& range_) const CV_OVERRIDE
@@ -2474,6 +2490,11 @@ public:
opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
}
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
for( m = 0; m < mmax; m += 2 )
{
@@ -2574,6 +2595,7 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
};
class Col2ImInvoker : public cv::ParallelLoopBody

@@ -173,7 +173,7 @@ public:
class FullyConnected : public ParallelLoopBody
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
Mat& dstMat, const ActivationLayer* activ, int nstripes)
@@ -197,6 +197,7 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useRVV = checkHardwareSupport(CPU_RVV);
p.useLASX = checkHardwareSupport(CPU_LASX);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -250,6 +251,11 @@ public:
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0;
@@ -305,6 +311,7 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
};
#ifdef HAVE_OPENCL

@@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr,
#endif // CV_RVV
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
float r0 = 1.f, r1 = 1.f, r2 = 1.f;
__m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f);
__m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2;
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
if( relu )
{
r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
if( i+2 >= outCn )
{
r2 = r1;
if( i+1 >= outCn )
r2 = r1 = r0;
}
vr0 = _v256_extract_low(_v256_setall_ps(r0));
vr1 = _v256_extract_low(_v256_setall_ps(r1));
vr2 = _v256_extract_low(_v256_setall_ps(r2));
}
int j = 0;
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const float* rptr = rowbuf + j*vecsize_aligned;
__m256i tmp;
__m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp);
for (; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0);
__m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0);
__m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0);
__m256 r0 = (__m256)__lasx_xvld(rptr, 0);
vs00 = __lasx_xvfmadd_s(w0, r0, vs00);
vs10 = __lasx_xvfmadd_s(w1, r0, vs10);
vs20 = __lasx_xvfmadd_s(w2, r0, vs20);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0);
vs01 = __lasx_xvfmadd_s(w0, r0, vs01);
vs11 = __lasx_xvfmadd_s(w1, r0, vs11);
vs21 = __lasx_xvfmadd_s(w2, r0, vs21);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0);
vs02 = __lasx_xvfmadd_s(w0, r0, vs02);
vs12 = __lasx_xvfmadd_s(w1, r0, vs12);
vs22 = __lasx_xvfmadd_s(w2, r0, vs22);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0);
vs03 = __lasx_xvfmadd_s(w0, r0, vs03);
vs13 = __lasx_xvfmadd_s(w1, r0, vs13);
vs23 = __lasx_xvfmadd_s(w2, r0, vs23);
}
/*t0*/
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm);
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm);
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm);
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm);
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
__m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
/*t1*/
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm);
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm);
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm);
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm);
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
__m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
/*t2*/
__m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm);
__m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32);
__m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl);
__m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm);
__m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32);
__m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl);
__m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm);
__m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32);
__m256 vs22_add_4w = __lasx_xvfadd_s(vs22_add_2w, tmp22_srl);
__m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm);
__m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32);
__m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl);
__m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w);
__m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w);
__m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20);
t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1));
t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1));
t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1));
__m128 s0, s1, s2;
if( initOutput )
{
s0 = _v256_extract_low(_v256_setall_ps(bias0));
s1 = _v256_extract_low(_v256_setall_ps(bias1));
s2 = _v256_extract_low(_v256_setall_ps(bias2));
}
else
{
s0 = (__m128)__lsx_vld(outptr0 + j, 0);
s1 = (__m128)__lsx_vld(outptr1 + j, 0);
s2 = (__m128)__lsx_vld(outptr2 + j, 0);
}
s0 = __lsx_vfadd_s(s0, *(__m128*)&t0);
s1 = __lsx_vfadd_s(s1, *(__m128*)&t1);
s2 = __lsx_vfadd_s(s2, *(__m128*)&t2);
if( relu )
{
__m128i m0 = __lsx_vfcmp_clt_s(z, s0);
__m128i m1 = __lsx_vfcmp_clt_s(z, s1);
__m128i m2 = __lsx_vfcmp_clt_s(z, s2);
s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0);
s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1);
s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2);
}
if( tail )
{
s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask);
s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask);
s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask);
}
__lsx_vst(s0, outptr0 + j, 0);
__lsx_vst(s1, outptr1 + j, 0);
__lsx_vst(s2, outptr2 + j, 0);
}
for( ; j <= blockSize - 2; j += 2 )
{
const float* rptr0 = rowbuf + j*vecsize_aligned;
const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
float s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
r = rptr1[k];
s01 += w0*r; s11 += w1*r; s21 += w2*r;
}
if( relu )
{
s00 = s00 > 0.f ? s00 : s00*r0;
s01 = s01 > 0.f ? s01 : s01*r0;
s10 = s10 > 0.f ? s10 : s10*r1;
s11 = s11 > 0.f ? s11 : s11*r1;
s20 = s20 > 0.f ? s20 : s20*r2;
s21 = s21 > 0.f ? s21 : s21*r2;
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const float* rptr0 = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
}
if( relu )
{
s00 = s00 > 0.f ? s00 : s00*r0;
s10 = s10 > 0.f ? s10 : s10*r1;
s20 = s20 > 0.f ? s20 : s20*r2;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
}
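// Loads 16 consecutive floats starting at ptr and de-interleaves them:
// a receives the even-indexed elements, b the odd-indexed ones.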
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
// dst = vec * weights^t + bias
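// Equivalent scalar form of what this kernel computes (a reference sketch, not used by
// the build):
//
//     for (int i = 0; i < nvecs; i++) {
//         float s = bias[i];
//         for (int k = 0; k < vecsize; k++)
//             s += vec[k] * weights[i*wstep + k];
//         dst[i] = s;
//     }
//
// The LASX version processes 8 rows of `weights` per outer iteration (vs0..vs7) and then
// horizontally reduces each accumulator to a single float before adding the bias.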
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
__m256i v256_tmp;
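// v256_tmp is deliberately left uninitialized: xor-ing a register with itself always
// yields zero, so it is only used here to materialize all-zero accumulators.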
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1);
vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2);
vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3);
vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4);
vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5);
vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6);
vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7);
}
        /* s0: horizontally reduce vs0..vs3 and pack the four per-row sums into one vector
           (the two 128-bit halves are folded together below) */
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm);
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm);
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm);
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm);
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
__m256 s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
        /* s1: the same horizontal reduction for vs4..vs7 */
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm);
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm);
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm);
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm);
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
__m256 s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1));
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1));
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0));
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4));
__lsx_vst(*(__m128*)&s0, dst + i, 0);
__lsx_vst(*(__m128*)&s1, dst + i, 4*4);
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
}
__m256i vs0_perm = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm);
__m256i tmp_srl = __lasx_xvsrli_d(vs0_add_2w, 32);
__m256 vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl);
temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4];
dst[i] = temp + bias[i];
}
}
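// Plain GEMM: cptr[m*cstep + n] = sum_k aptr[m*astep + k] * bptr[k*bstep + n], no bias or
// activation. The vector path tiles the output as 4 rows x 16 columns and keeps the eight
// accumulators d00..d31 live across the whole k loop; the scalar loop at the end handles
// the remaining columns on the right edge.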
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
{
int n = 0;
for( ; n <= nb - 16; n += 16 )
{
for( int m = 0; m < ma; m += 4 )
{
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
float* cptr0 = cptr + cstep*m;
float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
__m256i v256_tmp;
__m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < na; k++ )
{
__m256 a0 = _v256_setall_ps(aptr0[k]);
__m256 a1 = _v256_setall_ps(aptr1[k]);
__m256 a2 = _v256_setall_ps(aptr2[k]);
__m256 a3 = _v256_setall_ps(aptr3[k]);
__m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0);
__m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0);
d00 = __lasx_xvfmadd_s(a0, b0, d00);
d01 = __lasx_xvfmadd_s(a0, b1, d01);
d10 = __lasx_xvfmadd_s(a1, b0, d10);
d11 = __lasx_xvfmadd_s(a1, b1, d11);
d20 = __lasx_xvfmadd_s(a2, b0, d20);
d21 = __lasx_xvfmadd_s(a2, b1, d21);
d30 = __lasx_xvfmadd_s(a3, b0, d30);
d31 = __lasx_xvfmadd_s(a3, b1, d31);
}
__lasx_xvst(d00, cptr0 + n, 0);
__lasx_xvst(d01, cptr0 + n, 8*4);
__lasx_xvst(d10, cptr1 + n, 0);
__lasx_xvst(d11, cptr1 + n, 8*4);
__lasx_xvst(d20, cptr2 + n, 0);
__lasx_xvst(d21, cptr2 + n, 8*4);
__lasx_xvst(d30, cptr3 + n, 0);
__lasx_xvst(d31, cptr3 + n, 8*4);
}
}
for( ; n < nb; n++ )
{
for( int m = 0; m < ma; m++ )
{
const float* aptr0 = aptr + astep*m;
float* cptr0 = cptr + cstep*m;
float d0 = 0.f;
for( int k = 0; k < na; k++ )
d0 += aptr0[k]*bptr[k*bstep + n];
cptr0[n] = d0;
}
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@@ -2178,6 +2178,9 @@ public:
#if CV_TRY_SSE4_1
        bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
#endif
        int bh0 = std::min(BLOCK_SZ/2, dst.rows);
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@@ -2241,6 +2244,10 @@ public:
            if ( useAVX2 )
                x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
            #endif
#if CV_TRY_LASX
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
            #if CV_SIMD128
            {
                v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);

@@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X
#endif
}
namespace opt_LASX
{
#if CV_TRY_LASX
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#endif
}
namespace opt_SSE4_1
{
#if CV_TRY_SSE4_1

@@ -0,0 +1,98 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
#include "precomp.hpp"
#include "imgwarp.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace cv
{
namespace opt_LASX
{
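// Computes one block line of the affine warp map, 16 pixels per iteration: the integer
// source coordinates are stored interleaved in `xy` and the fractional INTER_TAB_SIZE
// indices are packed into `alpha`, mirroring the opt_SSE4_1/opt_AVX2 helpers of the same
// name. Returns the number of pixels processed so the caller can finish the tail.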
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
__m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1);
__m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0);
for (; x1 <= bw - 16; x1 += 16)
{
__m256i tx0, tx1, ty0, ty1;
tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX);
ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY);
tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX);
ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY);
tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS);
ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS);
tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS);
ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS);
__m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask),
__lasx_xvand_v(tx1, fxy_mask));
__m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask),
__lasx_xvand_v(ty1, fxy_mask));
tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS),
__lasx_xvsrai_w(tx1, INTER_BITS));
ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS),
__lasx_xvsrai_w(ty1, INTER_BITS));
fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS));
fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
__lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0);
__lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2);
__lasx_xvst(fx_, (__m256i*)(alpha + x1), 0);
}
return x1;
}
}
}
/* End of file. */

@@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
            opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
    }
    else
#endif
#if CV_TRY_LASX
if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4)))
{
if(pix_size == 2)
opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify);
else
opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify);
}
else
#endif
    {
        resizeNNInvoker invoker(src, dst, x_ofs, ify);

@@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double);
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width);
#endif
}
namespace opt_LASX
{
#if CV_TRY_LASX
void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double);
void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double);
#endif
}
}
#endif
/* End of file. */

@@ -0,0 +1,249 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
#include "precomp.hpp"
#include "resize.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace cv
{
namespace opt_LASX
{
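// Nearest-neighbour resize for 4-byte pixels: each iteration gathers 8 source pixels via
// the universal-intrinsics LUT helper v256_lut_quads (x_ofs holds precomputed byte
// offsets) and stores them as one 256-bit chunk; the scalar tail copies the remaining
// columns one int at a time.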
class resizeNNInvokerLASX4 CV_FINAL :
public ParallelLoopBody
{
public:
resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
ify(_ify)
{
}
virtual void operator() (const Range& range) const CV_OVERRIDE
{
Size ssize = src.size(), dsize = dst.size();
int y, x;
int width = dsize.width;
int avxWidth = width - (width & 0x7);
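        // Both branches below contain identical code; the aligned/unaligned split appears
        // to be kept for symmetry with the existing x86 SIMD resize paths.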
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
__lasx_xvst(pixels, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
}
}
}
else
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
__lasx_xvst(pixels, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
}
}
}
}
private:
const Mat& src;
Mat& dst;
int* x_ofs;
double ify;
resizeNNInvokerLASX4(const resizeNNInvokerLASX4&);
resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&);
};
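// 2-byte pixel variant: gathers 8 offsets from S and another 8 from S - 2 (so the wanted
// 16-bit value lands in the high half of each gathered 32-bit word), merges the two with
// h_mask, then shuffles and permutes the 16-bit values back into row order before the
// 256-bit store.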
class resizeNNInvokerLASX2 CV_FINAL :
public ParallelLoopBody
{
public:
resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
ify(_ify)
{
}
virtual void operator() (const Range& range) const CV_OVERRIDE
{
Size ssize = src.size(), dsize = dst.size();
int y, x;
int width = dsize.width;
int avxWidth = width - (width & 0xf);
const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0);
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
__lasx_xvst(ints_permuted, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
}
}
}
else
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
__lasx_xvst(ints_permuted, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
}
}
}
}
private:
const Mat& src;
Mat& dst;
int* x_ofs;
double ify;
resizeNNInvokerLASX2(const resizeNNInvokerLASX2&);
resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&);
};
void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
}
}
/* End of file. */