Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX

* Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX
* Add resize.lasx.cpp for Loongson SIMD acceleration
* Add imgwarp.lasx.cpp for Loongson SIMD acceleration
* Add LASX acceleration support for dnn/conv
* Add CV_PAUSE(v) for Loongarch
* Set LASX by default on Loongarch64
* LoongArch: tune test threshold for Core/HAL.mat_decomp/15

Co-authored-by: shengwenxue <shengwenxue@loongson.cn>
Branch: pull/22495/head
Author: wxsheng, committed via GitHub
Parent: 866191478f
Commit: 4154bd0667
Changed files, with the number of changed lines in parentheses:
  1. cmake/OpenCVCompilerOptimizations.cmake (7)
  2. cmake/OpenCVDetectCXXCompiler.cmake (2)
  3. cmake/checks/cpu_lasx.cpp (23)
  4. modules/core/include/opencv2/core/cv_cpu_dispatch.h (9)
  5. modules/core/include/opencv2/core/cv_cpu_helper.h (21)
  6. modules/core/include/opencv2/core/cvdef.h (4)
  7. modules/core/include/opencv2/core/hal/intrin.hpp (16)
  8. modules/core/include/opencv2/core/hal/intrin_lasx.hpp (3236)
  9. modules/core/src/parallel_impl.cpp (2)
  10. modules/core/src/system.cpp (6)
  11. modules/core/test/test_hal_core.cpp (4)
  12. modules/dnn/CMakeLists.txt (4)
  13. modules/dnn/src/int8layers/convolution_layer.cpp (18)
  14. modules/dnn/src/int8layers/fully_connected_layer.cpp (9)
  15. modules/dnn/src/int8layers/layers_common.simd.hpp (624)
  16. modules/dnn/src/layers/convolution_layer.cpp (24)
  17. modules/dnn/src/layers/fully_connected_layer.cpp (9)
  18. modules/dnn/src/layers/layers_common.simd.hpp (679)
  19. modules/imgproc/src/imgwarp.cpp (7)
  20. modules/imgproc/src/imgwarp.hpp (7)
  21. modules/imgproc/src/imgwarp.lasx.cpp (98)
  22. modules/imgproc/src/resize.cpp (10)
  23. modules/imgproc/src/resize.hpp (9)
  24. modules/imgproc/src/resize.lasx.cpp (249)
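
With the changes listed above, LASX becomes a regular OpenCV CPU feature. A minimal runtime check (a sketch, not part of this commit, assuming OpenCV was built from this branch on LoongArch64 hardware; it only uses the existing public API):

#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    // CPU_LASX (feature id 230) is the CpuFeatures entry added by this commit.
    std::cout << "LASX available: "
              << (cv::checkHardwareSupport(cv::CPU_LASX) ? "yes" : "no") << std::endl;
    // The build summary also lists the baseline/dispatched CPU features,
    // e.g. when configured with -DCPU_BASELINE=LASX.
    std::cout << cv::getBuildInformation() << std::endl;
    return 0;
}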

@@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -380,6 +381,12 @@ elseif(RISCV)
set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
# Helper values for cmake-gui

@@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
set(MIPS 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
set(RISCV 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
set(LOONGARCH64 1)
else()
if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
message(WARNING "OpenCV: unrecognized target processor configuration")

@@ -0,0 +1,23 @@
#include <stdio.h>
#if defined(__loongarch_asx)
# include <lasxintrin.h>
# define CV_LASX 1
#endif
#if defined CV_LASX
int test()
{
const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s (val), 7);
}
#else
#error "LASX is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}

@@ -172,6 +172,11 @@
# define CV_MSA 1
#endif
#ifdef CV_CPU_COMPILE_LASX
# include <lasxintrin.h>
# define CV_LASX 1
#endif
#ifdef __EMSCRIPTEN__
# define CV_WASM_SIMD 1
# include <wasm_simd128.h>
@@ -370,3 +375,7 @@ struct VZeroUpperGuard {
#ifndef CV_RVV
# define CV_RVV 0
#endif
#ifndef CV_LASX
# define CV_LASX 0
#endif

@@ -525,5 +525,26 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 1
# define CV_CPU_HAS_SUPPORT_LASX 1
# define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX
# define CV_TRY_LASX 1
# define CV_CPU_FORCE_LASX 0
# define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX))
# define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
# define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
#else
# define CV_TRY_LASX 0
# define CV_CPU_FORCE_LASX 0
# define CV_CPU_HAS_SUPPORT_LASX 0
# define CV_CPU_CALL_LASX(fn, args)
# define CV_CPU_CALL_LASX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...) CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
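
For reference, the new CV_CPU_CALL_LASX / __CV_CPU_DISPATCH_CHAIN_LASX helpers above are consumed by the existing CV_CPU_DISPATCH machinery. A minimal sketch of a dispatch site (the file and function names below are hypothetical; the pattern mirrors OpenCV's existing dispatched source files):

// my_filter.cpp
#include "my_filter.simd.hpp"
#include "my_filter.simd_declarations.hpp" // generated by CMake; defines CV_CPU_DISPATCH_MODES_ALL

void myFilter(const uchar* src, uchar* dst, int len)
{
    // Expands through the __CV_CPU_DISPATCH_CHAIN_* macros: with LASX in the
    // dispatch list, CV_CPU_CALL_LASX() tests CV_CPU_HAS_SUPPORT_LASX at run
    // time and calls opt_LASX::myFilter() before falling back to the baseline.
    CV_CPU_DISPATCH(myFilter, (src, dst, len), CV_CPU_DISPATCH_MODES_ALL);
}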

@@ -279,6 +279,8 @@ namespace cv {
#define CV_CPU_RVV 210
#define CV_CPU_LASX 230
// CPU features groups
#define CV_CPU_AVX512_SKX 256
#define CV_CPU_AVX512_COMMON 257
@@ -336,6 +338,8 @@ enum CpuFeatures {
CPU_RVV = 210,
CPU_LASX = 230,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF

@@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1
#endif
#include "opencv2/core/hal/intrin_cpp.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
@@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
#if CV_LASX
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_lasx.hpp"
#endif
//! @cond IGNORED
namespace cv {

modules/core/include/opencv2/core/hal/intrin_lasx.hpp: file diff suppressed because it is too large (3236 lines added).

@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
// https://github.com/riscv/riscv-isa-manual/issues/43
// # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0)
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# elif defined __GNUC__ && defined __loongarch__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# else
# warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags."
# define CV_PAUSE(...) do { /* no-op: works, but not effective */ } while (0)

@@ -434,6 +434,8 @@ struct HWFeatures
g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_LASX] = "LASX";
}
void initialize(void)
@@ -689,6 +691,10 @@ struct HWFeatures
have[CV_CPU_RVV] = true;
#endif
#if defined __loongarch_asx
have[CV_CPU_LASX] = true;
#endif
bool skip_baseline_check = false;
#ifndef NO_GETENV
if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK"))

@@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp)
int size = (hcase / 2) % 4;
size = size == 0 ? 3 : size == 1 ? 4 : size == 2 ? 6 : 15;
int nfunc = (hcase / 8);
#if CV_LASX
double eps = depth == CV_32F ? 1e-5 : 2e-10;
#else
double eps = depth == CV_32F ? 1e-5 : 1e-10;
#endif
if( size == 3 )
return; // TODO ???

@@ -8,8 +8,8 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

@@ -579,13 +579,14 @@ public:
bool is1x1_;
bool useAVX2;
bool useAVX512;
bool useLASX;
int blk_size_cn;
int inpZp, outZp;
const std::vector<float>* multiplier;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
, blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
{}
@@ -641,6 +642,8 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
@@ -837,6 +840,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1210,6 +1220,12 @@ public:
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{

@@ -226,7 +226,7 @@ public:
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -250,6 +250,7 @@ public:
p.activ = !activationLUT.empty() ? activ : 0;
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useLASX = checkHardwareSupport(CPU_LASX);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -294,6 +295,11 @@ public:
if( useAVX2 )
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
{
int i = 0;
@@ -349,6 +355,7 @@ public:
int nstripes, outZp;
bool useAVX2;
bool useAVX512;
bool useLASX;
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE

@@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
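// Signed int8 multiply-accumulate: for each 32-bit lane, adjacent pairs of
// int8 products a[i]*b[i] are accumulated in int16 via the even/odd widening
// multiply-add instructions, then widened to int32, summed, and added to c,
// i.e. a four-element int8 dot product per 32-bit lane.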
inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c)
{
__m256i vzero = __lasx_xvreplgr2vr_d(0);
__m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b);
__m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b);
__m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16);
__m256i odd_madd_ab = __lasx_xvsrai_w(madd_ab, 16);
return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c);
}
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const int8_t* wptr0 = weights + i*wstep;
const int8_t* wptr1 = wptr0 + wstep;
const int8_t* wptr2 = wptr1 + wstep;
int* outptr0 = output + i*outPlaneSize;
int* outptr1 = outptr0 + outPlaneSize;
int* outptr2 = outptr1 + outPlaneSize;
int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
mult2 = mult1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
mult2 = mult1 = mult0;
}
}
int j = 0;
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const int8_t* rptr = rowbuf + j*vecsize_aligned;
__m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0),
vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0),
vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0),
vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0),
vs20 = __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0),
vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0);
for (; k < vecsize; k += 32, rptr += 32 )
{
__m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0);
__m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0);
__m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0);
__m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0);
vs00 = _v256_fmadds8_s32(w0, r0, vs00);
vs10 = _v256_fmadds8_s32(w1, r0, vs10);
vs20 = _v256_fmadds8_s32(w2, r0, vs20);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0);
vs01 = _v256_fmadds8_s32(w0, r0, vs01);
vs11 = _v256_fmadds8_s32(w1, r0, vs11);
vs21 = _v256_fmadds8_s32(w2, r0, vs21);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0);
vs02 = _v256_fmadds8_s32(w0, r0, vs02);
vs12 = _v256_fmadds8_s32(w1, r0, vs12);
vs22 = _v256_fmadds8_s32(w2, r0, vs22);
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0);
vs03 = _v256_fmadds8_s32(w0, r0, vs03);
vs13 = _v256_fmadds8_s32(w1, r0, vs13);
vs23 = _v256_fmadds8_s32(w2, r0, vs23);
}
/*t0*/
__m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00);
__m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w);
__m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01);
__m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w);
__m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02);
__m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w);
__m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03);
__m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w);
__m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d);
__m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d);
__m256i t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00);
/*t1*/
__m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10);
__m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w);
__m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11);
__m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w);
__m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12);
__m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w);
__m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13);
__m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w);
__m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d);
__m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d);
__m256i t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10);
/*t2*/
__m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20);
__m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w);
__m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21);
__m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w);
__m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22);
__m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w);
__m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23);
__m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w);
__m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d);
__m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d);
__m256i t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20);
t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1));
t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1));
t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1));
__m128i s0, s1, s2;
if( initOutput )
{
s0 = __lsx_vreplgr2vr_w(bias0);
s1 = __lsx_vreplgr2vr_w(bias1);
s2 = __lsx_vreplgr2vr_w(bias2);
}
else
{
s0 = __lsx_vld((__m128i*)(outptr0 + j), 0);
s1 = __lsx_vld((__m128i*)(outptr1 + j), 0);
s2 = __lsx_vld((__m128i*)(outptr2 + j), 0);
}
s0 = __lsx_vadd_w(s0, *(__m128i*)&t0);
s1 = __lsx_vadd_w(s1, *(__m128i*)&t1);
s2 = __lsx_vadd_w(s2, *(__m128i*)&t2);
if( finalOutput )
{
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
__m256 v_mult0 = _v256_setall_ps(mult0);
__m256 v_mult1 = _v256_setall_ps(mult1);
__m256 v_mult2 = _v256_setall_ps(mult2);
s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0)));
s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1)));
s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2)));
s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax);
s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax);
s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax);
}
if( tail )
{
s0 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0), s0, mask);
s1 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0), s1, mask);
s2 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0), s2, mask);
}
__lsx_vst(s0, (__m128i*)(outptr0 + j), 0);
__lsx_vst(s1, (__m128i*)(outptr1 + j), 0);
__lsx_vst(s2, (__m128i*)(outptr2 + j), 0);
}
for( ; j <= blockSize - 2; j += 2 )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
int s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
r = rptr1[k];
s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
int s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
}
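// Widens the 32 signed int8 elements of a and b to int16, multiplies them
// element-wise, then widens the products to int32 and accumulates them, in
// order, into the four 8-lane accumulators out0..out3.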
static inline void _v256_expand_mul_add(const __m256i& a, const __m256i& b,
__m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
{
__m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0);
__m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0);
__m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0);
__m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0);
__m256i a0b0 = __lasx_xvmul_h(a0, b0);
__m256i a1b1 = __lasx_xvmul_h(a1, b1);
out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0));
out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0));
out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0));
out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0));
}
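// Loads 64 consecutive int8 values starting at ptr and de-interleaves them:
// a receives the even-indexed elements, b the odd-indexed ones.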
static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
{
__m256i t0 = __lasx_xvld((const __m256i*)ptr, 0);
__m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1);
const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = __lasx_xvshuf_b(t0, t0, sh);
__m256i p1 = __lasx_xvshuf_b(t1, t1, sh);
__m256i lo = __lasx_xvpermi_q(p0, p1, 0x02);
__m256i hi = __lasx_xvpermi_q(p0, p1, 0x13);
a = __lasx_xvilvl_d(hi, lo);
b = __lasx_xvilvh_d(hi, lo);
}
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp)
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float mult = multptr[out_d];
int bias = biasptr[out_d];
int biasCopy;
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t* imgptr0 = inptr_ + in_i*width;
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out;
biasCopy = bias;
if (in_i < 0)
{
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
biasCopy + inpZp*(w00 + w10 + w20);
outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 32;
__m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02),
vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12),
vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22);
__m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp),
outmin = __lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127);
__m256 vmult = _v256_setall_ps(mult);
__m256i vout0, vout1, vout2, vout3;
if( stride_w == 1 )
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0),
v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0),
v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0),
v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0),
v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0),
v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0),
v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0),
v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0),
v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0);
vout0 = vout1 = vout2 = vout3 = vbias;
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
}
}
else
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
vout0 = vout1 = vout2 = vout3 = vbias;
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
}
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp*(w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp*(w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp*(w02 + w12 + w22);
}
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
}
}
// dst = vec * weights^t + bias
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0),
vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0),
vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0),
vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0);
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1);
vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2);
vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3);
vs4 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*4), 0), v, vs4);
vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5);
vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6);
vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7);
}
/*s0*/
__m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
__m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w);
__m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1);
__m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w);
__m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2);
__m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w);
__m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3);
__m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w);
__m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d);
__m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d);
__m256i s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0);
/*s1*/
__m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4);
__m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w);
__m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5);
__m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w);
__m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6);
__m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w);
__m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7);
__m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w);
__m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d);
__m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d);
__m256i s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4);
s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1));
s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1));
__m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0));
__m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4));
t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0))));
t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4))));
t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax);
t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax);
__lsx_vst(t0, (__m128i*)(dst + i), 0);
__lsx_vst(t1, (__m128i*)(dst + i), 4*4);
}
for( ; i < nvecs; i++ )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = __lasx_xvreplgr2vr_d(0);
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
}
__m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3];
dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@@ -986,12 +986,13 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
int blk_size_cn;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false)
, useLASX(false), blk_size_cn(0)
{}
static void run( const Mat& input, Mat& output, const Mat& weights,
@@ -1049,6 +1050,7 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;
p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -1256,6 +1258,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1631,6 +1640,12 @@ public:
opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{
@@ -2437,6 +2452,7 @@ public:
useAVX2 = checkHardwareSupport(CPU_AVX2);
useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
useRVV = checkHardwareSupport(CPU_RVV);
useLASX = checkHardwareSupport(CPU_LASX);
}
void operator()(const Range& range_) const CV_OVERRIDE
@@ -2474,6 +2490,11 @@ public:
opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
}
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
for( m = 0; m < mmax; m += 2 )
{
@@ -2574,6 +2595,7 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
};
class Col2ImInvoker : public cv::ParallelLoopBody

@@ -173,7 +173,7 @@ public:
class FullyConnected : public ParallelLoopBody
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
Mat& dstMat, const ActivationLayer* activ, int nstripes)
@@ -197,6 +197,7 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useRVV = checkHardwareSupport(CPU_RVV);
p.useLASX = checkHardwareSupport(CPU_LASX);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -250,6 +251,11 @@ public:
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_LASX
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0;
@@ -305,6 +311,7 @@ public:
bool useAVX2;
bool useAVX512;
bool useRVV;
bool useLASX;
};
#ifdef HAVE_OPENCL

@@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr,
#endif // CV_RVV
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
float r0 = 1.f, r1 = 1.f, r2 = 1.f;
__m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f);
__m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2;
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
if( relu )
{
r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
if( i+2 >= outCn )
{
r2 = r1;
if( i+1 >= outCn )
r2 = r1 = r0;
}
vr0 = _v256_extract_low(_v256_setall_ps(r0));
vr1 = _v256_extract_low(_v256_setall_ps(r1));
vr2 = _v256_extract_low(_v256_setall_ps(r2));
}
int j = 0;
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const float* rptr = rowbuf + j*vecsize_aligned;
__m256i tmp;
__m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp),
vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp);
for (; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0);
__m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0);
__m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0);
__m256 r0 = (__m256)__lasx_xvld(rptr, 0);
vs00 = __lasx_xvfmadd_s(w0, r0, vs00);
vs10 = __lasx_xvfmadd_s(w1, r0, vs10);
vs20 = __lasx_xvfmadd_s(w2, r0, vs20);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0);
vs01 = __lasx_xvfmadd_s(w0, r0, vs01);
vs11 = __lasx_xvfmadd_s(w1, r0, vs11);
vs21 = __lasx_xvfmadd_s(w2, r0, vs21);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0);
vs02 = __lasx_xvfmadd_s(w0, r0, vs02);
vs12 = __lasx_xvfmadd_s(w1, r0, vs12);
vs22 = __lasx_xvfmadd_s(w2, r0, vs22);
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0);
vs03 = __lasx_xvfmadd_s(w0, r0, vs03);
vs13 = __lasx_xvfmadd_s(w1, r0, vs13);
vs23 = __lasx_xvfmadd_s(w2, r0, vs23);
}
/*t0*/
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm);
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm);
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm);
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm);
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
__m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
/*t1*/
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm);
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm);
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm);
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm);
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
__m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
/*t2*/
__m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm);
__m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32);
__m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl);
__m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm);
__m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32);
__m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl);
__m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm);
__m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32);
__m256 vs22_add_4w = __lasx_xvfadd_s(vs22_add_2w, tmp22_srl);
__m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm);
__m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32);
__m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl);
__m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w);
__m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w);
__m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20);
t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1));
t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1));
t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1));
__m128 s0, s1, s2;
if( initOutput )
{
s0 = _v256_extract_low(_v256_setall_ps(bias0));
s1 = _v256_extract_low(_v256_setall_ps(bias1));
s2 = _v256_extract_low(_v256_setall_ps(bias2));
}
else
{
s0 = (__m128)__lsx_vld(outptr0 + j, 0);
s1 = (__m128)__lsx_vld(outptr1 + j, 0);
s2 = (__m128)__lsx_vld(outptr2 + j, 0);
}
s0 = __lsx_vfadd_s(s0, *(__m128*)&t0);
s1 = __lsx_vfadd_s(s1, *(__m128*)&t1);
s2 = __lsx_vfadd_s(s2, *(__m128*)&t2);
if( relu )
{
__m128i m0 = __lsx_vfcmp_clt_s(z, s0);
__m128i m1 = __lsx_vfcmp_clt_s(z, s1);
__m128i m2 = __lsx_vfcmp_clt_s(z, s2);
s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0);
s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1);
s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2);
}
if( tail )
{
s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask);
s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask);
s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask);
}
__lsx_vst(s0, outptr0 + j, 0);
__lsx_vst(s1, outptr1 + j, 0);
__lsx_vst(s2, outptr2 + j, 0);
}
for( ; j <= blockSize - 2; j += 2 )
{
const float* rptr0 = rowbuf + j*vecsize_aligned;
const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
float s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
r = rptr1[k];
s01 += w0*r; s11 += w1*r; s21 += w2*r;
}
if( relu )
{
s00 = s00 > 0.f ? s00 : s00*r0;
s01 = s01 > 0.f ? s01 : s01*r0;
s10 = s10 > 0.f ? s10 : s10*r1;
s11 = s11 > 0.f ? s11 : s11*r1;
s20 = s20 > 0.f ? s20 : s20*r2;
s21 = s21 > 0.f ? s21 : s21*r2;
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const float* rptr0 = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
}
if( relu )
{
s00 = s00 > 0.f ? s00 : s00*r0;
s10 = s10 > 0.f ? s10 : s10*r1;
s20 = s20 > 0.f ? s20 : s20*r2;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
}
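// Loads 16 consecutive floats starting at ptr and de-interleaves them:
// a receives the even-indexed elements, b the odd-indexed ones.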
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
// dst = vec * weights^t + bias
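// Equivalent scalar form of what this kernel computes (a reference sketch, not used by
// the build):
//
//     for (int i = 0; i < nvecs; i++) {
//         float s = bias[i];
//         for (int k = 0; k < vecsize; k++)
//             s += vec[k] * weights[i*wstep + k];
//         dst[i] = s;
//     }
//
// The LASX version processes 8 rows of `weights` per outer iteration (vs0..vs7) and then
// horizontally reduces each accumulator to a single float before adding the bias.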
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
__m256i v256_tmp;
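// v256_tmp is deliberately left uninitialized: xor-ing a register with itself always
// yields zero, so it is only used here to materialize all-zero accumulators.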
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1);
vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2);
vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3);
vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4);
vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5);
vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6);
vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7);
}
        /* s0: horizontally reduce vs0..vs3 and pack the four per-row sums into one vector
           (the two 128-bit halves are folded together below) */
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm);
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm);
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm);
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm);
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
__m256 s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
        /* s1: the same horizontal reduction for vs4..vs7 */
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm);
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm);
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm);
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm);
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
__m256 s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1));
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1));
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0));
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4));
__lsx_vst(*(__m128*)&s0, dst + i, 0);
__lsx_vst(*(__m128*)&s1, dst + i, 4*4);
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
}
__m256i vs0_perm = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
__m256 vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm);
__m256i tmp_srl = __lasx_xvsrli_d(vs0_add_2w, 32);
__m256 vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl);
temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4];
dst[i] = temp + bias[i];
}
}
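// Plain GEMM: cptr[m*cstep + n] = sum_k aptr[m*astep + k] * bptr[k*bstep + n], no bias or
// activation. The vector path tiles the output as 4 rows x 16 columns and keeps the eight
// accumulators d00..d31 live across the whole k loop; the scalar loop at the end handles
// the remaining columns on the right edge.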
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
{
int n = 0;
for( ; n <= nb - 16; n += 16 )
{
for( int m = 0; m < ma; m += 4 )
{
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
float* cptr0 = cptr + cstep*m;
float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
__m256i v256_tmp;
__m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
__m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
for( int k = 0; k < na; k++ )
{
__m256 a0 = _v256_setall_ps(aptr0[k]);
__m256 a1 = _v256_setall_ps(aptr1[k]);
__m256 a2 = _v256_setall_ps(aptr2[k]);
__m256 a3 = _v256_setall_ps(aptr3[k]);
__m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0);
__m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0);
d00 = __lasx_xvfmadd_s(a0, b0, d00);
d01 = __lasx_xvfmadd_s(a0, b1, d01);
d10 = __lasx_xvfmadd_s(a1, b0, d10);
d11 = __lasx_xvfmadd_s(a1, b1, d11);
d20 = __lasx_xvfmadd_s(a2, b0, d20);
d21 = __lasx_xvfmadd_s(a2, b1, d21);
d30 = __lasx_xvfmadd_s(a3, b0, d30);
d31 = __lasx_xvfmadd_s(a3, b1, d31);
}
__lasx_xvst(d00, cptr0 + n, 0);
__lasx_xvst(d01, cptr0 + n, 8*4);
__lasx_xvst(d10, cptr1 + n, 0);
__lasx_xvst(d11, cptr1 + n, 8*4);
__lasx_xvst(d20, cptr2 + n, 0);
__lasx_xvst(d21, cptr2 + n, 8*4);
__lasx_xvst(d30, cptr3 + n, 0);
__lasx_xvst(d31, cptr3 + n, 8*4);
}
}
for( ; n < nb; n++ )
{
for( int m = 0; m < ma; m++ )
{
const float* aptr0 = aptr + astep*m;
float* cptr0 = cptr + cstep*m;
float d0 = 0.f;
for( int k = 0; k < na; k++ )
d0 += aptr0[k]*bptr[k*bstep + n];
cptr0[n] = d0;
}
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@@ -2178,6 +2178,9 @@ public:
#if CV_TRY_SSE4_1
        bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
#endif
        int bh0 = std::min(BLOCK_SZ/2, dst.rows);
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@@ -2241,6 +2244,10 @@ public:
            if ( useAVX2 )
                x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
            #endif
#if CV_TRY_LASX
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
            #if CV_SIMD128
            {
                v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);

@@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X
#endif
}
namespace opt_LASX
{
#if CV_TRY_LASX
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#endif
}
namespace opt_SSE4_1
{
#if CV_TRY_SSE4_1

@@ -0,0 +1,98 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
#include "precomp.hpp"
#include "imgwarp.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace cv
{
namespace opt_LASX
{
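// Computes one block line of the affine warp map, 16 pixels per iteration: the integer
// source coordinates are stored interleaved in `xy` and the fractional INTER_TAB_SIZE
// indices are packed into `alpha`, mirroring the opt_SSE4_1/opt_AVX2 helpers of the same
// name. Returns the number of pixels processed so the caller can finish the tail.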
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
__m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1);
__m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0);
for (; x1 <= bw - 16; x1 += 16)
{
__m256i tx0, tx1, ty0, ty1;
tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX);
ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY);
tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX);
ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY);
tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS);
ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS);
tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS);
ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS);
__m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask),
__lasx_xvand_v(tx1, fxy_mask));
__m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask),
__lasx_xvand_v(ty1, fxy_mask));
tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS),
__lasx_xvsrai_w(tx1, INTER_BITS));
ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS),
__lasx_xvsrai_w(ty1, INTER_BITS));
fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS));
fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
__lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0);
__lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2);
__lasx_xvst(fx_, (__m256i*)(alpha + x1), 0);
}
return x1;
}
}
}
/* End of file. */

@@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
            opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
    }
    else
#endif
#if CV_TRY_LASX
if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4)))
{
if(pix_size == 2)
opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify);
else
opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify);
}
else
#endif
    {
        resizeNNInvoker invoker(src, dst, x_ofs, ify);

@@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double);
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width);
#endif
}
namespace opt_LASX
{
#if CV_TRY_LASX
void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double);
void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double);
#endif
}
}
#endif
/* End of file. */

@@ -0,0 +1,249 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
#include "precomp.hpp"
#include "resize.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace cv
{
namespace opt_LASX
{
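// Nearest-neighbour resize for 4-byte pixels: each iteration gathers 8 source pixels via
// the universal-intrinsics LUT helper v256_lut_quads (x_ofs holds precomputed byte
// offsets) and stores them as one 256-bit chunk; the scalar tail copies the remaining
// columns one int at a time.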
class resizeNNInvokerLASX4 CV_FINAL :
public ParallelLoopBody
{
public:
resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
ify(_ify)
{
}
virtual void operator() (const Range& range) const CV_OVERRIDE
{
Size ssize = src.size(), dsize = dst.size();
int y, x;
int width = dsize.width;
int avxWidth = width - (width & 0x7);
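        // Both branches below contain identical code; the aligned/unaligned split appears
        // to be kept for symmetry with the existing x86 SIMD resize paths.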
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
__lasx_xvst(pixels, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
}
}
}
else
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
__lasx_xvst(pixels, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
}
}
}
}
private:
const Mat& src;
Mat& dst;
int* x_ofs;
double ify;
resizeNNInvokerLASX4(const resizeNNInvokerLASX4&);
resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&);
};
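// 2-byte pixel variant: gathers 8 offsets from S and another 8 from S - 2 (so the wanted
// 16-bit value lands in the high half of each gathered 32-bit word), merges the two with
// h_mask, then shuffles and permutes the 16-bit values back into row order before the
// 256-bit store.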
class resizeNNInvokerLASX2 CV_FINAL :
public ParallelLoopBody
{
public:
resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
ify(_ify)
{
}
virtual void operator() (const Range& range) const CV_OVERRIDE
{
Size ssize = src.size(), dsize = dst.size();
int y, x;
int width = dsize.width;
int avxWidth = width - (width & 0xf);
const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0);
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
__lasx_xvst(ints_permuted, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
}
}
}
else
{
for(y = range.start; y < range.end; y++)
{
uchar* D = dst.data + dst.step*y;
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
__lasx_xvst(ints_permuted, (int*)D, 0);
D += 32;
}
for(; x < width; x++)
{
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
}
}
}
}
private:
const Mat& src;
Mat& dst;
int* x_ofs;
double ify;
resizeNNInvokerLASX2(const resizeNNInvokerLASX2&);
resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&);
};
void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
}
}
/* End of file. */