diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index cc0b5f8216..b1751e3987 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
+list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
 
 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -380,6 +381,12 @@ elseif(RISCV)
   set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
   set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")
 
+elseif(LOONGARCH64)
+  ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
+  ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
+  set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
+
 endif()
 
 # Helper values for cmake-gui
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 7f229cde96..8fe89b3fe0 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
   set(MIPS 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
   set(RISCV 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
+  set(LOONGARCH64 1)
 else()
   if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
     message(WARNING "OpenCV: unrecognized target processor configuration")
diff --git a/cmake/checks/cpu_lasx.cpp b/cmake/checks/cpu_lasx.cpp
new file mode 100644
index 0000000000..9d3b2a8725
--- /dev/null
+++ b/cmake/checks/cpu_lasx.cpp
@@ -0,0 +1,23 @@
+#include <stdio.h>
+
+#if defined(__loongarch_asx)
+# include <lasxintrin.h>
+# define CV_LASX 1
+#endif
+
+#if defined CV_LASX
+int test()
+{
+    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
+    v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
+    return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s (val), 7);
+}
+#else
+#error "LASX is not supported"
+#endif
+
+int main()
+{
+    printf("%d\n", test());
+    return 0;
+}
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 12e4cb47b8..3235b6317e 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -172,6 +172,11 @@
 # define CV_MSA 1
 #endif
 
+#ifdef CV_CPU_COMPILE_LASX
+# include <lasxintrin.h>
+# define CV_LASX 1
+#endif
+
 #ifdef __EMSCRIPTEN__
 # define CV_WASM_SIMD 1
 # include <wasm_simd128.h>
@@ -370,3 +375,7 @@ struct VZeroUpperGuard {
 #ifndef CV_RVV
 # define CV_RVV 0
 #endif
+
+#ifndef CV_LASX
+# define CV_LASX 0
+#endif
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index 91b853de0c..41fc9d50fa 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -525,5 +525,26 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) 
CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 1 +# define CV_CPU_HAS_SUPPORT_LASX 1 +# define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX)) +# define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +# define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +#else +# define CV_TRY_LASX 0 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX 0 +# define CV_CPU_CALL_LASX(fn, args) +# define CV_CPU_CALL_LASX_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...) CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 21e3792162..95dc81fb46 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -279,6 +279,8 @@ namespace cv { #define CV_CPU_RVV 210 +#define CV_CPU_LASX 230 + // CPU features groups #define CV_CPU_AVX512_SKX 256 #define CV_CPU_AVX512_COMMON 257 @@ -336,6 +338,8 @@ enum CpuFeatures { CPU_RVV = 210, + CPU_LASX = 230, + CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512 CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index c12140bbf8..6eac27e763 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE) #include "opencv2/core/hal/intrin_rvv.hpp" + #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE #include "opencv2/core/hal/intrin_rvv_scalable.hpp" + +#elif CV_LASX + #if !defined(CV_FORCE_SIMD128_CPP) + #define CV_FORCE_SIMD128_CPP 1 + #endif +#include "opencv2/core/hal/intrin_cpp.hpp" + #else #include "opencv2/core/hal/intrin_cpp.hpp" @@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif +#if CV_LASX + +#define CV__SIMD_FORWARD 256 +#include "opencv2/core/hal/intrin_forward.hpp" +#include "opencv2/core/hal/intrin_lasx.hpp" + +#endif + //! @cond IGNORED namespace cv { diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp new file mode 100644 index 0000000000..37f2e3f81d --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -0,0 +1,3236 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_LASX_HPP +#define OPENCV_HAL_INTRIN_LASX_HPP + +#include +#include + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 +#define CV_SIMD256_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31 }; +} + +inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v31, v30, + v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, + v19, v18, v17, v16, v15, v14, v13, v12, v11, v10, + v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15) +{ + return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }; +} + +inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setall_w(int v0) +{ + return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 }; +} + +inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v0, v1, v2, v3 }; +} + +inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v3, v2, v1, v0 }; +} + +inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7) +{ + return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256 _v256_setall_ps(float f32) +{ + return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 }; +} + +inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3) +{ + return (__m256d)v4f64{ v0, v1, v2, v3 }; +} + +inline __m256d _v256_setall_pd(double f64) +{ + return (__m256d)v4f64{ f64, f64, f64, f64 }; +} + +inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b) +{ + __m256i u8min = __lasx_xvreplgr2vr_h(0); + __m256i u8max = __lasx_xvreplgr2vr_h(255); + __m256i sat_a = __lasx_xvmax_h(a, u8min); + sat_a = __lasx_xvmin_h(sat_a, u8max); + __m256i sat_b = __lasx_xvmax_h(b, u8min); + sat_b = __lasx_xvmin_h(sat_b, u8max); + __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); +} + +inline __m256i 
_lasx_packs_h(const __m256i& a, const __m256i& b) +{ + __m256i s8min = __lasx_xvreplgr2vr_h(-128); + __m256i s8max = __lasx_xvreplgr2vr_h(127); + __m256i sat_a = __lasx_xvmax_h(a, s8min); + sat_a = __lasx_xvmin_h(sat_a, s8max); + __m256i sat_b = __lasx_xvmax_h(b, s8min); + sat_b = __lasx_xvmin_h(sat_b, s8max); + __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + return __lasx_xvshuf_b(sat_b, sat_a, byteIndex); +} + +inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b) +{ + __m256i u16min = __lasx_xvreplgr2vr_w(0); + __m256i u16max = __lasx_xvreplgr2vr_w(0xffff); + __m256i sat_a = __lasx_xvmax_w(a, u16min); + sat_a = __lasx_xvmin_w(sat_a, u16max); + __m256i sat_b = __lasx_xvmax_w(b, u16min); + sat_b = __lasx_xvmin_w(sat_b, u16max); + __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14); + return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); +} + +inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b) +{ + __m256i s16min = __lasx_xvreplgr2vr_w(-0x8000); + __m256i s16max = __lasx_xvreplgr2vr_w(0x7fff); + __m256i sat_a = __lasx_xvmax_w(a, s16min); + sat_a = __lasx_xvmin_w(sat_a, s16max); + __m256i sat_b = __lasx_xvmax_w(b, s16min); + sat_b = __lasx_xvmin_w(sat_b, s16max); + __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14, + 0, 2, 4, 6, 8, 10, 12, 14); + return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a); +} + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline __m256i _v256_shuffle_odd_64(const __m256i& v) +{ return __lasx_xvpermi_d(v, 0xd8); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); } + +//LASX: only use for permute WITHOUT zero clearing +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return __lasx_xvpermi_q(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return __lasx_xvpermi_d(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31); + return *((__m128i*)&temp256i); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return __m128(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return __m128d(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return 
*((__m128i*)&v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return __m128(_v256_extract_low(*((__m256i*)&v))); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return __m128d(_v256_extract_low(*((__m256i*)&v))); } + +inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) +{ + const __m256i maxv = __lasx_xvreplgr2vr_w(65535); + __m256i am = __lasx_xvmin_wu(a, maxv); + __m256i bm = __lasx_xvmin_wu(b, maxv); + return _lasx_packus_w(am, bm); +} + +template +inline int _v256_extract_b(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_b(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_h(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_h(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_w(const __m256i& a) +{ + return __lasx_xvpickve2gr_w(a, i); +} + +template +inline int64 _v256_extract_d(const __m256i& a) +{ + return __lasx_xvpickve2gr_d(a, i); +} + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint8x32() {} + + uchar get0() const { + uchar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int8x32 +{ + typedef schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int8x32() {} + + schar get0() const { + schar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint16x16() {} + + ushort get0() 
const { + ushort des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int16x16() {} + + short get0() const { + short des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint32x8() {} + + unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int32x8() {} + + int get0() const { return __lasx_xvpickve2gr_w(val, 0); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + explicit v_float32x8(__m256i v) { val = *((__m256*)&v); } + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_float32x8() {} + + float get0() const { + float des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } + + int get0toint() const { + int des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_uint64x4() {} + + uint64 get0() const + { + return __lasx_xvpickve2gr_du(val, 0); + } +}; + +struct v_int64x4 +{ + typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _v256_setr_d(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_int64x4() {} + + int64 get0() const + { + return __lasx_xvpickve2gr_d(val, 0); + } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); } + v_float64x4(double v0, double v1, double v2, double v3) + { val = _v256_setr_pd(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_float64x4() {} + + double get0() const { + double des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + 
} + + int64 get0toint64() const { + int64 des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = __lsx_vld(ptr0, 0); \ + __m128i vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4, int64) + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = __lsx_vld(ptr0, 0); \ + halfreg vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i) +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i) + + +inline __m256i _lasx_256_castps_si256(const __m256& v) +{ return __m256i(v); } + 
+inline __m256i _lasx_256_castpd_si256(const __m256d& v) +{ return __m256i(v); } + +#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256) + +OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32, uchar, u8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int8x32, schar, s8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort, u16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int16x16, short, s16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8, unsigned, u32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int32x8, int, s32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4, uint64, u64, d, long int) +OPENCV_HAL_IMPL_LASX_INIT(v_int64x4, int64, s64, d, long int) + + +inline __m256 _lasx_256_castsi256_ps(const __m256i &v) +{ return __m256(v); } + +inline __m256d _lasx_256_castsi256_pd(const __m256i &v) +{ return __m256d(v); } + +#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_v256_setall_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float, f32, ps, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); } + +inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); } + + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), 
__m256i(a.val))); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); } + +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d) + + +// shuffle +// todo: emulate 64bit +#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \ + template \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8, w) + +template +inline v_float32x8 v256_shuffle(const v_float32x8 &a) +{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); } + +template +inline v_float64x4 v256_shuffle(const v_float64x4 &a) +{ + int imm8 = m & 0b0001; //0 or 1 + if (m & 0x0b0010) imm8 |= 0b0100; + //else imm8 |= 0b0000; + if (m & 0x0b0100) imm8 |= 0b110000; //2 or 3 + else imm8 |= 0b100000; + if (m & 0x0b1000) imm8 |= 0b11000000; + else imm8 |= 0b10000000; + + return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8)); +} +template +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x03>(a, b); } + +inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm) +{ + if (imm == 8) { + return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2 + } else { + __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex)); + } +} + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2 +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<0x1b>(a); } + + +// ZIP +#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x02>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x13>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = 
v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d) + +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d) + +// saturating multiply 8-bit, 16-bit +inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +{ + v_uint16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +{ + v_int16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_hu(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_uint16x16(_v256_packs_epu32(p0, p1)); +} +inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_h(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_int16x16(_lasx_packs_w(p0, p1)); +} +inline v_uint8x32& operator 
*= (v_uint8x32& a, const v_uint8x32& b) +{ a = a * b; return a; } +inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) +{ a = a * b; return a; } +inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) +{ a = a * b; return a; } +inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) +{ a = a * b; return a; } + +/** Non-saturating arithmetics **/ + +#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h) + +inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i ad = __lasx_xvsrai_h(a.val, 8); + __m256i bd = __lasx_xvsrai_h(b.val, 8); + __m256i p0 = __lasx_xvmul_h(a.val, b.val); + __m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8); + + const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00); + return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01)); +} +inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b, + v_uint16x16& c, v_uint16x16& d) +{ + v_uint16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b, + v_int16x16& c, v_int16x16& d) +{ + v_int16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val); + __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + +inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); } +inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); } + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const 
_Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } + +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h) +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w) + +inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift) +{ + __m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63); + __m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift); + return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift)); +} +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, _v256_srai_dx) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1)) + +#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; } + +#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); } + +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), 
_lasx_256_castsi256_pd) + +/** Select **/ +#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8) +OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8) + +inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +/** Comparison **/ +#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } \ + inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ + { return b > a; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { \ + return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \ + } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32, v_int8x32, b, bu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu) + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d) + +#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix) + +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d) + +inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); } + +inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); } + +inline v_float64x4 operator > (const v_float64x4 &a, 
const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); } + +inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); } + +inline v_float32x8 v_not_nan(const v_float32x8& a) +{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); } +inline v_float64x4 v_not_nan(const v_float64x4& a) +{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); } + +/** min/max **/ +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_R = (16 - imm) & 0xFF}; + enum {IMM_R2 = (32 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x21>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R)); + return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32 +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x03>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swap, a.val, imm)); + return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + enum {IMM_R = (16 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? [127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R)); + return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? 
[127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swapz, a.val, imm)); + return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L)); +} + +#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float64x4, _lasx_256_castsi256_pd) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd) + +/** Reverse **/ +inline v_uint8x32 v_reverse(const v_uint8x32 &a) +{ + static const __m256i perm = _v256_setr_b( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); + return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1)); +} + +inline v_int8x32 v_reverse(const v_int8x32 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x16 v_reverse(const v_uint16x16 &a) +{ + static const __m256i perm = _v256_setr_b( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); + return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1)); +} + +inline v_int16x16 v_reverse(const v_int16x16 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x8 v_reverse(const v_uint32x8 &a) +{ + static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0); + return v_uint32x8(__lasx_xvperm_w(a.val, perm)); +} + +inline v_int32x8 v_reverse(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x8 v_reverse(const v_float32x8 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x4 v_reverse(const v_uint64x4 &a) +{ + return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b)); +} + +inline v_int64x4 v_reverse(const v_int64x4 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x4 v_reverse(const v_float64x4 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////// Reduce and mask ///////// + +/** Reduce **/ +// this function is return a[0]+a[1]+...+a[31] +inline unsigned v_reduce_sum(const v_uint8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val); + __m256i t2 
= __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline int v_reduce_sum(const v_int8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val); + __m256i t2 = __lasx_xvhaddw_w_h(t1, t1); + __m256i t3 = __lasx_xvhaddw_d_w(t2, t2); + return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]); +} + + +#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \ + val = intrin(val, __lsx_vbsrl_v(val,8)); \ + val = intrin(val, __lsx_vbsrl_v(val,4)); \ + val = intrin(val, __lsx_vbsrl_v(val,2)); \ + val = intrin(val, __lsx_vbsrl_v(val,1)); \ + return (sctype)__lsx_vpickve2gr_w(val, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, min, __lsx_vmin_b) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, max, __lsx_vmax_b) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, min, __lsx_vmin_h) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, max, __lsx_vmax_h) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, min, __lsx_vmin_w) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, max, __lsx_vmax_w) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \ + float *fvalue = (float*)&v0; \ + return fvalue[0]; \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s) + +inline int v_reduce_sum(const v_int32x8& a) +{ + __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val); + return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline int v_reduce_sum(const v_int16x16& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline unsigned v_reduce_sum(const v_uint16x16& a) +{ return 
v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + float result = 0; + float *pa = (float*)&a; + for (int i = 0; i < 2; ++i) { + result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3]; + } + return result; +} + +inline uint64 v_reduce_sum(const v_uint64x4& a) +{ + uint64 *pa = (uint64*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} +inline int64 v_reduce_sum(const v_int64x4& a) +{ + int64 *pa = (int64*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} +inline double v_reduce_sum(const v_float64x4& a) +{ + double *pa = (double*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + float *pa = (float*)&a; + float *pb = (float*)&b; + float *pc = (float*)&c; + float *pd = (float*)&d; + + float v0 = pa[0] + pa[1] + pa[2] + pa[3]; + float v1 = pb[0] + pb[1] + pb[2] + pb[3]; + float v2 = pc[0] + pc[1] + pc[2] + pc[3]; + float v3 = pd[0] + pd[1] + pd[2] + pd[3]; + float v4 = pa[4] + pa[5] + pa[6] + pa[7]; + float v5 = pb[4] + pb[5] + pb[6] + pb[7]; + float v6 = pc[4] + pc[5] + pc[6] + pc[7]; + float v7 = pd[4] + pd[5] + pd[6] + pd[7]; + return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7); +} + +inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i t0 = __lasx_xvabsd_bu(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) +{ + __m256i t0 = __lasx_xvabsd_b(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]); +} +inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_add_wrap(a - b, b - a), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) +{ + return v_reduce_sum(v_max(a, b) - v_min(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 m = a < b; + return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); +} +inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) +{ + v_float32x8 a_b = a - b; + return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff))); +} + +/** Popcount **/ +inline v_uint8x32 v_popcount(const v_uint8x32& a) +{ + __m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + __m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F); + return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)), + __lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask)))); +} +inline v_uint16x16 v_popcount(const v_uint16x16& a) +{ + v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff)); +} +inline 
v_uint32x8 v_popcount(const v_uint32x8& a) +{ + v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff)); +} +inline v_uint64x4 v_popcount(const v_uint64x4& a) +{ + v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a)); + uint8_t *pa = (uint8_t*)&atemp; + uint64 v[4]; + for (int i = 0; i < 4; ++i) { + v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7]; + } + return v_uint64x4(v[0], v[1], v[2], v[3]); +} +inline v_uint8x32 v_popcount(const v_int8x32& a) +{ return v_popcount(v_reinterpret_as_u8(a)); } +inline v_uint16x16 v_popcount(const v_int16x16& a) +{ return v_popcount(v_reinterpret_as_u16(a)); } +inline v_uint32x8 v_popcount(const v_int32x8& a) +{ return v_popcount(v_reinterpret_as_u32(a)); } +inline v_uint64x4 v_popcount(const v_int64x4& a) +{ return v_popcount(v_reinterpret_as_u64(a)); } + +/** Mask **/ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +inline int v_signmask(const v_int8x32& a) +{ + int mask = 0; + int8_t *pa = (int8_t*)&a; + for( int i = 0; i < 32; i++ ) + mask |= (reinterpret_int(pa[i]) < 0) << i; + return mask; +} +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ return v_signmask(v_pack(a, a)) & 0xFFFF; } +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + int mask = 0; + int *pa = (int*)&a; + for( int i = 0; i < 8; i++ ) + mask |= (pa[i] < 0) << i; + return mask; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_int64x4& a) +{ + int mask = 0; + int64 *pa = (int64*)&a; + for( int i = 0; i < 4; i++ ) + mask |= (pa[i] < 0) << i; + return mask; +} +inline int v_signmask(const v_uint64x4& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_float64x4& a) +{ return v_signmask(*(v_int64x4*)(&a)); } + +inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x8& a) { return 
trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +/** Checks **/ +#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15) + +#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \ + inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \ + inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16) + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b*b)); } + +OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d) + +inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return a * b + c; +} + +inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ + v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5); + v_float32x8 t = v_float32x8(__lasx_xvfrsqrt_s(x.val)); + t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half); + return t; +} + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ + return v_float64x4(1., 1., 1., 1.) 
/ v_sqrt(x); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); } + +OPENCV_HAL_IMPL_LASX_ABS(int8x32, b) +OPENCV_HAL_IMPL_LASX_ABS(int16x16, h) +OPENCV_HAL_IMPL_LASX_ABS(int32x8, w) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = v_sub_wrap(a, b); + v_int8x32 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 d = a - b; + v_int32x8 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(a - b); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = a - b; + v_int8x32 m = a < b; + return (d ^ m) - m; +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftint_w_s(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ __m256i t = __lasx_xvftint_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m256i abi = __lasx_xvftint_w_d(b.val, a.val); + return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); //3120 +} + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(__lasx_xvffint_s_w(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m256 abf = 
__lasx_xvfcvt_s_d(a.val, b.val); //warnning: order of a,b is diff from instruction xvfcvt.s.d + return v_float32x8(__lasx_xvpermi_d(abf, 0x8D)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvffintl_d_w(alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvffintl_d_w(ahigh)); +} + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh)); +} + +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x4 v_cvt_f64(const v_int64x4& v) +{ + // constants encoded as floating-point + __m256i magic_i_lo = __lasx_xvreplgr2vr_d(0x4330000000000000); + __m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000); + __m256i magic_i_all = __lasx_xvreplgr2vr_d(0x4530000080100000); + __m256d magic_d_all = _lasx_256_castsi256_pd(magic_i_all); + + // Blend the 32 lowest significant bits of v with magic_int_lo + __m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1); + __m256i v_lo = __lasx_xvbitsel_v(magic_i_lo, v.val, mask); + // Extract the 32 most significant bits of v + __m256i v_hi = __lasx_xvsrli_d(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = __lasx_xvxor_v(v_hi, magic_i_hi32); + // Compute in double precision + __m256d v_hi_dbl = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition + __m256d result = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo)); + return v_float64x4(result); +} + +////////////// Lookup table access //////////////////// + +inline v_int8x32 v256_lut(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], + tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], + tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]], + tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]], + tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], + tab[idx[30]], tab[idx[31]])); +} +inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), + *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), + *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]), + *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]), + *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), + *(const short*)(tab + idx[15]))); +} +inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]))); +} +inline v_uint8x32 v256_lut(const uchar* tab, const 
int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); } + +inline v_int16x16 v256_lut(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], + tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], + tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], + tab[idx[15]])); +} +inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int16x16 v256_lut_quads(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); + +} +inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); } + +inline v_int32x8 v256_lut(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int32x8 v256_lut_quads(const int* tab, const int* idx) +{ + return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); } + +inline v_int64x4 v256_lut(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } +inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return 
v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x8 v256_lut(const float* tab, const int* idx) +{ + return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} +inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } +inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); } + +inline v_float64x4 v256_lut(const double* tab, const int* idx) +{ + return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) +{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); } + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int *idx = (int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + const int *idx = (const int*)&idxvec.val; + __m128i xy01, xy45, xy23, xy67; + xy01 = __lsx_vld(tab + idx[0], 0); + xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10); + xy45 = __lsx_vld(tab + idx[4], 0); + xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10); + __m256i xy0145 = _v256_combine(xy01, xy45); + xy23 = __lsx_vld(tab + idx[2], 0); + xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10); + xy67 = __lsx_vld(tab + idx[6], 0); + xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10); + __m256i xy2367 = _v256_combine(xy23, xy67); + + __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145); + __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145); + + x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145)); + y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + //int CV_DECL_ALIGNED(32) idx[4]; + const int *idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy2 = __lsx_vld(tab + idx[2], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + __m128i xy3 = __lsx_vld(tab + idx[3], 0); + __m256i xy02 = _v256_combine(xy0, xy2); + __m256i xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(__lasx_xvilvl_d(xy13, xy02)); + y = v_float64x4(__lasx_xvilvh_d(xy13, xy02)); +} + +inline v_int8x32 v_interleave_pairs(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); +} +inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x32 v_interleave_quads(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); +} +inline v_uint8x32 
v_interleave_quads(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_interleave_pairs(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); +} +inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x16 v_interleave_quads(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); +} +inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_interleave_pairs(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8)); +} +inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) +{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x32 v_pack_triplets(const v_int8x32& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100)); + __m256i t2 = __lasx_xvshuf_b(vzero, t1, + _v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100)); + return v_int8x32(__lasx_xvperm_w(t2, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_pack_triplets(const v_int16x16& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100)); + __m256i t2 = __lasx_xvshuf_b(vzero, t1, + _v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100)); + return v_int16x16(__lasx_xvperm_w(t2, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_pack_triplets(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvperm_w(vec.val, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_pack_triplets(const v_float32x8& vec) +{ + return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val), + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} + +////////// Matrix operations ///////// + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 
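+// For reference, a scalar sketch of what the 32 >> 64 dot product below computes
+// (illustrative only, not part of this patch; ref_dotprod32 is a hypothetical helper):
+// each 64-bit output lane i is a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1], with both products
+// widened to 64 bit before the addition.
+//
+//     inline int64 ref_dotprod32(const int* a, const int* b, int i)
+//     {
+//         return (int64)a[2*i] * b[2*i] + (int64)a[2*i + 1] * b[2*i + 1];
+//     }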
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) +{ + __m256i even = __lasx_xvmulwev_d_w(a.val, b.val); + __m256i odd = __lasx_xvmulwod_d_w(a.val, b.val); + return v_int64x4(__lasx_xvadd_d(even, odd)); +} +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00); + __m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m); + __m256i odd_a = __lasx_xvsrli_h(a.val, 8); + + __m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), even_m); + __m256i odd_b = __lasx_xvsrli_h(b.val, 8); + + __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); + __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + return v_uint32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) +{ + __m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8); + __m256i odd_a = __lasx_xvsrai_h(a.val, 8); + + __m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8); + __m256i odd_b = __lasx_xvsrai_h(b.val, 8); + + __m256i prod0 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b)); + __m256i prod1 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b)); + return v_int32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = __lasx_xvmul_h(a.val, b.val); + __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); + __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); + __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); + + __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p13 = __lasx_xvsrli_d(mul0, 32); + __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p57 = __lasx_xvsrli_d(mul1, 32); + + __m256i p15_ = __lasx_xvadd_d(p02, p13); + __m256i p9d_ = __lasx_xvadd_d(p46, p57); + + return v_uint64x4(__lasx_xvadd_d( + __lasx_xvilvl_d(p9d_, p15_), + __lasx_xvilvh_d(p9d_, p15_))); +} +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); + __m256i sign = __lasx_xvsrai_w(prod, 31); + + __m256i lo = __lasx_xvilvl_w(sign, prod); + __m256i hi = __lasx_xvilvh_w(sign, prod); + + return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo))); +} +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& 
c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b) +{ return v_dotprod(a, b); } +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod(a, b); } +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = __lasx_xvmul_h(a.val, b.val); + __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val); + __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo); + __m256i mul1 = __lasx_xvilvh_h(mulhi, mullo); + + __m256i p02 = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p13 = __lasx_xvsrli_d(mul0, 32); + __m256i p46 = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0)); + __m256i p57 = __lasx_xvsrli_d(mul1, 32); + + __m256i p15_ = __lasx_xvadd_d(p02, p13); + __m256i p9d_ = __lasx_xvadd_d(p46, p57); + + return v_uint64x4(__lasx_xvadd_d(p15_, p9d_)); +} +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); + __m256i sign = __lasx_xvsrai_w(prod, 31); + __m256i lo = __lasx_xvilvl_w(sign, prod); + __m256i hi = __lasx_xvilvh_w(sign, prod); + return v_int64x4(__lasx_xvadd_d(lo, hi)); +} +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b, c); } + + +#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \ + v_float32x8(__lasx_xvpermi_w(a.val, a.val, im)) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = 
OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + + +#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \ + __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \ + __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \ + __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \ + b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \ + b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \ + b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \ + b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \ + } + +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) + +inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1, + const v_float32x8 &a2, const v_float32x8 &a3, + v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3) +{ + __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val)); + __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val)); + b0.val = __m256(__lasx_xvilvl_d(t1, t0)); + b1.val = __m256(__lasx_xvilvh_d(t1, t0)); + b2.val = __m256(__lasx_xvilvl_d(t3, t2)); + b3.val = __m256(__lasx_xvilvh_d(t3, t2)); +} + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(a.val); \ + b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(a.val)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpwvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16, uchar, __lasx_vext2xv_hu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16, schar, __lasx_vext2xv_h_b) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8, ushort, __lasx_vext2xv_wu_hu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8, short, __lasx_vext2xv_w_h) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4, unsigned, __lasx_vext2xv_du_wu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4, int, __lasx_vext2xv_d_w) + +#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8, schar, __lasx_vext2xv_w_b) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i t = __lasx_xvreplgr2vr_h(255); + __m256i a1 = __lasx_xvmin_hu(a.val, t); + __m256i b1 = __lasx_xvmin_hu(b.val, t); + return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1))); 
+}
+
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    const __m256i m = __lasx_xvreplgr2vr_h(255);
+    __m256i am = __lasx_xvmin_hu(a.val, m);
+    am = _v256_shuffle_odd_64(_lasx_packus_h(am, am));
+    v_store_low(ptr, v_uint8x32(am));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+                    v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
+{
+    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
+{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
+
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
+
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    const __m256i m = __lasx_xvreplgr2vr_w(65535);
+    __m256i am = __lasx_xvmin_wu(a.val, m);
+    am = _v256_shuffle_odd_64(_lasx_packus_w(am, am));
+    v_store_low(ptr, v_uint16x16(am));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
+{
+    // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
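+    // Adding 1 << (n-1) before the right shift rounds to nearest (halves go up) instead
+    // of truncating; e.g. with n = 2: (5 + 2) >> 2 == 1 and (6 + 2) >> 2 == 2.
+    // The same rounding pattern is used by every v_rshr_pack* variant above and below.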
+    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+                    v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a)
+{
+    v_int32x8 delta = v256_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
+    __m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08);
+    __m256i ab = __lasx_xvilvl_d(b0, a0);
+    return v_uint32x8(_v256_shuffle_odd_64(ab));
+}
+
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
+    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x4& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
+{
+    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a)
+{
+    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// pack boolean
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
+{
+    __m256i ab = _lasx_packs_h(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(ab));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
+                           const v_uint32x8& c, const v_uint32x8& d)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val);
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+
+    __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
+    return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val);
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+    __m256i ef = _lasx_packs_w(e.val, f.val);
+    __m256i gh = _lasx_packs_w(g.val, h.val);
+
+    __m256i abcd = _lasx_packs_w(ab, cd);
+    __m256i efgh = _lasx_packs_w(ef, gh);
+
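+    // The inputs are 64-bit boolean masks (all zeros or all ones), so the cascade of
+    // saturating packs above only narrows them; the final pack and the shuffle/interleave
+    // steps below restore the logical lane order of the 32 result bytes.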
+    __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));
+
+    __m256i rev = _v256_alignr_b(pkall, pkall, 8);
+    return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
+}
+
+/* Recombine */
+// it's up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \
+    template<int s> \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
+
+template<int i>
+inline uchar v_extract_n(v_uint8x32 a)
+{
+    return (uchar)_v256_extract_b<i>(a.val);
+}
+
+template<int i>
+inline schar v_extract_n(v_int8x32 a)
+{
+    return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
+}
+
+template<int i>
+inline ushort v_extract_n(v_uint16x16 a)
+{
+    return (ushort)_v256_extract_h<i>(a.val);
+}
+
+template<int i>
+inline short v_extract_n(v_int16x16 a)
+{
+    return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
+}
+
+template<int i>
+inline uint v_extract_n(v_uint32x8 a)
+{
+    return (uint)_v256_extract_w<i>(a.val);
+}
+
+template<int i>
+inline int v_extract_n(v_int32x8 a)
+{
+    return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
+}
+
+template<int i>
+inline uint64 v_extract_n(v_uint64x4 a)
+{
+    return (uint64)_v256_extract_d<i>(a.val);
+}
+
+template<int i>
+inline int64 v_extract_n(v_int64x4 v)
+{
+    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
+}
+
+template<int i>
+inline float v_extract_n(v_float32x8 v)
+{
+    union { uint iv; float fv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(v_float64x4 v)
+{
+    union { uint64 iv; double dv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
+{
+    static const __m256i perm = __lasx_xvreplgr2vr_w((char)i);
+    return v_uint32x8(__lasx_xvperm_w(a.val, perm));
+}
+
+template<int i>
+inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{
+    __m256i ab0 = __lasx_xvld(ptr, 0);
+    __m256i ab1 = __lasx_xvld(ptr + 32, 0);
+
+    const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
+    __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
+    __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
+    __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
+    __m256i a0 = __lasx_xvilvl_d(ph, pl);
+    __m256i b0 = __lasx_xvilvh_d(ph, pl);
+    a = v_uint8x32(a0);
+    b = v_uint8x32(b0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{
+    __m256i ab0 = __lasx_xvld(ptr, 0);
+    __m256i ab1 = __lasx_xvld(ptr + 16, 0);
+
+    const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
+    __m256i p1 =
__lasx_xvshuf_b(ab1, ab1, sh); + __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint16x16(a0); + b = v_uint16x16(b0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) +{ + __m256i ab0 = __lasx_xvld(ptr, 0); + __m256i ab1 = __lasx_xvld(ptr + 8, 0); + + //const int sh = 0+2*4+1*16+3*64; + __m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8); + __m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8); + __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint32x8(a0); + b = v_uint32x8(b0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) +{ + __m256i ab0 = __lasx_xvld(ptr, 0); + __m256i ab1 = __lasx_xvld(ptr + 4, 0); + + __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02); + __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint64x4(a0); + b = v_uint64x4(b0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 32, 0); + __m256i bgr2 = __lasx_xvld(ptr + 64, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); + + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + + const __m256i + sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, + 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), + sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, + 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14), + sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, + 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint8x32(b0); + b = v_uint8x32(g0); + c = v_uint8x32(r0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 16, 0); + __m256i bgr2 = __lasx_xvld(ptr + 32, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + 
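+    // After the blends, b0/g0/r0 each collect all 16 values of one channel, but still
+    // at the interleaved positions they were loaded from; the byte shuffles below
+    // (sh_b/sh_g/sh_r) rearrange them into ascending element order.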
const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint16x16(b0); + b = v_uint16x16(g0); + c = v_uint16x16(r0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 8, 0); + __m256i bgr2 = __lasx_xvld(ptr + 16, 0); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92); + + b0 = __lasx_xvshuf4i_w(b0, 0x6c); + g0 = __lasx_xvshuf4i_w(g0, 0xb1); + r0 = __lasx_xvshuf4i_w(r0, 0xc6); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 4, 0); + __m256i bgr2 = __lasx_xvld(ptr + 8, 0); + + __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128 + __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12); + __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b); + __m256i b0 = __lasx_xvilvl_d(s20r, s01); + __m256i g0 = _v256_alignr_b(s12, s01, 8); + __m256i r0 = __lasx_xvilvh_d(s12, s20r); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 32, 0); + __m256i bgr2 = __lasx_xvld(ptr + 64, 0); + __m256i bgr3 = __lasx_xvld(ptr + 96, 0); + const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); + __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); + __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); + __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint8x32(b0); + b = v_uint8x32(g0); + c = v_uint8x32(r0); + d = v_uint8x32(a0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, 
v_uint16x16& d ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr + 16, 0); + __m256i bgr2 = __lasx_xvld(ptr + 32, 0); + __m256i bgr3 = __lasx_xvld(ptr + 48, 0); + const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh); + __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh); + __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh); + __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint16x16(b0); + b = v_uint16x16(g0); + c = v_uint16x16(r0); + d = v_uint16x16(a0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d ) +{ + __m256i p0 = __lasx_xvld(ptr, 0); + __m256i p1 = __lasx_xvld(ptr + 8, 0); + __m256i p2 = __lasx_xvld(ptr + 16, 0); + __m256i p3 = __lasx_xvld(ptr + 24, 0); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); + d = v_uint32x8(a0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d ) +{ + __m256i bgra0 = __lasx_xvld(ptr, 0); + __m256i bgra1 = __lasx_xvld(ptr + 4, 0); + __m256i bgra2 = __lasx_xvld(ptr + 8, 0); + __m256i bgra3 = __lasx_xvld(ptr + 12, 0); + + __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02); + __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13); + __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02); + __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13); + + __m256i b0 = __lasx_xvilvl_d(l13, l02); + __m256i g0 = __lasx_xvilvh_d(l13, l02); + __m256i r0 = __lasx_xvilvl_d(h13, h02); + __m256i a0 = __lasx_xvilvh_d(h13, h02); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); + d = v_uint64x4(a0); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_b(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_b(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 32*1); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = 
__lasx_xvilvl_h(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_h(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 16*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_w(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_w(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 8*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_d(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_d(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 4*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + const __m256i sh_g = _v256_setr_b( + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + const __m256i sh_r = _v256_setr_b( + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); + + const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 32); + __lasx_xvst(bgr2, (__m256i*)ptr, 64); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b( + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + const __m256i sh_r = _v256_setr_b( + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); + + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, 
-1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgr2, (__m256i*)ptr, 32*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c); + __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1); + __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6); + + __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p2, (__m256i*)ptr, 8*4); + __lasx_xvst(bgr2, (__m256i*)ptr, 16*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i s01 = __lasx_xvilvl_d(b.val, a.val); + __m256i s12 = __lasx_xvilvh_d(c.val, b.val); + __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4); + + __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30); + __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 4*8); + __lasx_xvst(bgr2, (__m256i*)ptr, 8*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, + const v_uint8x32& c, const v_uint8x32& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_b(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_b(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_b(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_b(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 32); + __lasx_xvst(bgra2, (__m256i*)ptr, 64); + __lasx_xvst(bgra3, (__m256i*)ptr, 96); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, + const v_uint16x16& c, const v_uint16x16& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_h(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_h(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_h(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_h(d.val, 
c.val); + + __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgra2, (__m256i*)ptr, 32*2); + __lasx_xvst(bgra3, (__m256i*)ptr, 48*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_w(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_w(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_w(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_w(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 8*4); + __lasx_xvst(bgra2, (__m256i*)ptr, 16*4); + __lasx_xvst(bgra3, (__m256i*)ptr, 24*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, + const v_uint64x4& c, const v_uint64x4& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_d(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_d(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_d(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_d(d.val, c.val); + + __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16); + __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16); + __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8); + __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8); + __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8); +} + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 
= v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \ +} + +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) + +// +// FP16 +// + +inline v_float32x8 v256_load_expand(const float16_t* ptr) +{ +#if CV_FP16 + //1-load128, 2-permi, 3-cvt + return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10))); +#else + float CV_DECL_ALIGNED(32) buf[8]; + for (int i = 0; i < 8; i++) + buf[i] = (float)ptr[i]; + return v256_load_aligned(buf); +#endif +} + +inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +{ +#if CV_FP16 + __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val); + __lsx_vst((__m128i)ah, ptr, 0); +#else + float CV_DECL_ALIGNED(32) buf[8]; + v_store_aligned(buf, a); + for (int i = 0; i < 8; i++) + ptr[i] = float16_t(buf[i]); +#endif +} + +// +// end of FP16 +// + +inline void v256_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_LASX_HPP diff --git a/modules/core/src/parallel_impl.cpp b/modules/core/src/parallel_impl.cpp index c118bcd3cb..708627c072 100644 --- a/modules/core/src/parallel_impl.cpp +++ b/modules/core/src/parallel_impl.cpp @@ -59,6 +59,8 @@ DECLARE_CV_PAUSE // https://github.com/riscv/riscv-isa-manual/issues/43 // # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0) # define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0) +# elif defined __GNUC__ && defined __loongarch__ +# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0) # else # warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags." # define CV_PAUSE(...) 
do { /* no-op: works, but not effective */ } while (0) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 4e7e1b7ea0..027072a5da 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -434,6 +434,8 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL"; g_hwFeatureNames[CPU_RVV] = "RVV"; + + g_hwFeatureNames[CPU_LASX] = "LASX"; } void initialize(void) @@ -689,6 +691,10 @@ struct HWFeatures have[CV_CPU_RVV] = true; #endif + #if defined __loongarch_asx + have[CV_CPU_LASX] = true; + #endif + bool skip_baseline_check = false; #ifndef NO_GETENV if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK")) diff --git a/modules/core/test/test_hal_core.cpp b/modules/core/test/test_hal_core.cpp index 35fb977478..f9078e55f9 100644 --- a/modules/core/test/test_hal_core.cpp +++ b/modules/core/test/test_hal_core.cpp @@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp) int size = (hcase / 2) % 4; size = size == 0 ? 3 : size == 1 ? 4 : size == 2 ? 6 : 15; int nfunc = (hcase / 8); + #if CV_LASX + double eps = depth == CV_32F ? 1e-5 : 2e-10; + #else double eps = depth == CV_32F ? 1e-5 : 1e-10; + #endif if( size == 3 ) return; // TODO ??? diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index e0773d5214..08cda81819 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -8,8 +8,8 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") -ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV) -ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX) +ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX) +ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index dfa58b09fe..320a18e5ab 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -579,13 +579,14 @@ public: bool is1x1_; bool useAVX2; bool useAVX512; + bool useLASX; int blk_size_cn; int inpZp, outZp; const std::vector* multiplier; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), - biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false) + biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false) , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0) {} @@ -641,6 +642,8 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; + p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D; + int kernel_d = isConv3D? kernel_size[0] : 1; int kernel_h = isConv1D? 
1 : kernel_size[kernel_size.size() - 2]; int kernel_w = kernel_size.back(); @@ -837,6 +840,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else #endif { const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1210,6 +1220,12 @@ public: opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else #endif for( int i = 0; i < outCn; i += 2 ) { diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index dc759ebdbc..867f002dd4 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -226,7 +226,7 @@ public: { public: FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0), - dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {} + dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {} static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier, const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp) @@ -250,6 +250,7 @@ public: p.activ = !activationLUT.empty() ? 
activ : 0; p.useAVX2 = checkHardwareSupport(CPU_AVX2); p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; + p.useLASX = checkHardwareSupport(CPU_LASX); parallel_for_(Range(0, nstripes), p, nstripes); } @@ -294,6 +295,11 @@ public: if( useAVX2 ) opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else #endif { int i = 0; @@ -349,6 +355,7 @@ public: int nstripes, outZp; bool useAVX2; bool useAVX512; + bool useLASX; }; void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp index bf6149e5c9..1b3ac7a4b8 100644 --- a/modules/dnn/src/int8layers/layers_common.simd.hpp +++ b/modules/dnn/src/int8layers/layers_common.simd.hpp @@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights, } #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c) +{ + __m256i vzero = __lasx_xvreplgr2vr_d(0); + __m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b); + __m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b); + + __m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16); + __m256i odd_madd_ab = __lasx_xvsrai_w(madd_ab, 16); + + return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c); +} + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128i mask = __lsx_vld((const float*)maskbuf, 0); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const int8_t* wptr0 = weights + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + const int8_t* wptr2 = wptr1 + wstep; + int* outptr0 = output + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int* outptr2 = outptr1 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + mult2 = mult1; + + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + mult2 = mult1 = mult0; + } + } + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const int8_t* rptr = rowbuf + j*vecsize_aligned; + + __m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0), + vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0), + vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0), + vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0), + vs20 
= __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0), + vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0); + + for (; k < vecsize; k += 32, rptr += 32 ) + { + __m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0); + __m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0); + __m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0); + __m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0); + + vs00 = _v256_fmadds8_s32(w0, r0, vs00); + vs10 = _v256_fmadds8_s32(w1, r0, vs10); + vs20 = _v256_fmadds8_s32(w2, r0, vs20); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0); + vs01 = _v256_fmadds8_s32(w0, r0, vs01); + vs11 = _v256_fmadds8_s32(w1, r0, vs11); + vs21 = _v256_fmadds8_s32(w2, r0, vs21); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0); + vs02 = _v256_fmadds8_s32(w0, r0, vs02); + vs12 = _v256_fmadds8_s32(w1, r0, vs12); + vs22 = _v256_fmadds8_s32(w2, r0, vs22); + + r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0); + vs03 = _v256_fmadds8_s32(w0, r0, vs03); + vs13 = _v256_fmadds8_s32(w1, r0, vs13); + vs23 = _v256_fmadds8_s32(w2, r0, vs23); + } + + /*t0*/ + __m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00); + __m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w); + + __m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01); + __m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w); + + __m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02); + __m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w); + + __m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03); + __m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w); + + __m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d); + __m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d); + __m256i t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*t1*/ + __m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10); + __m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w); + + __m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11); + __m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w); + + __m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12); + __m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w); + + __m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13); + __m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w); + + __m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d); + __m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d); + __m256i t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + /*t2*/ + __m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20); + __m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w); + + __m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21); + __m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w); + + __m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22); + __m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w); + + __m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23); + __m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w); + + __m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d); + __m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d); + __m256i t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20); + + t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1)); + t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1)); + t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1)); + + __m128i s0, s1, s2; + + if( initOutput ) + { + s0 = __lsx_vreplgr2vr_w(bias0); + s1 = __lsx_vreplgr2vr_w(bias1); + 
s2 = __lsx_vreplgr2vr_w(bias2); + } + else + { + s0 = __lsx_vld((__m128i*)(outptr0 + j), 0); + s1 = __lsx_vld((__m128i*)(outptr1 + j), 0); + s2 = __lsx_vld((__m128i*)(outptr2 + j), 0); + } + + s0 = __lsx_vadd_w(s0, *(__m128i*)&t0); + s1 = __lsx_vadd_w(s1, *(__m128i*)&t1); + s2 = __lsx_vadd_w(s2, *(__m128i*)&t2); + + if( finalOutput ) + { + __m128i voutzp = __lsx_vreplgr2vr_w(outZp); + __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127); + __m256 v_mult0 = _v256_setall_ps(mult0); + __m256 v_mult1 = _v256_setall_ps(mult1); + __m256 v_mult2 = _v256_setall_ps(mult2); + + s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0))); + s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1))); + s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2))); + + s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax); + s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax); + s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax); + } + if( tail ) + { + s0 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0), s0, mask); + s1 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0), s1, mask); + s2 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0), s2, mask); + } + __lsx_vst(s0, (__m128i*)(outptr0 + j), 0); + __lsx_vst(s1, (__m128i*)(outptr1 + j), 0); + __lsx_vst(s2, (__m128i*)(outptr2 + j), 0); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned; + int s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + r = rptr1[k]; + s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127); + } + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + int s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + } + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } +} + +static inline void _v256_expand_mul_add(const 
__m256i& a, const __m256i& b, + __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3) +{ + __m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0); + __m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0); + + __m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0); + __m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0); + + __m256i a0b0 = __lasx_xvmul_h(a0, b0); + __m256i a1b1 = __lasx_xvmul_h(a1, b1); + + out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0)); + out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0)); + out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0)); + out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0)); +} + +static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b) +{ + __m256i t0 = __lasx_xvld((const __m256i*)ptr, 0); + __m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1); + + const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = __lasx_xvshuf_b(t0, t0, sh); + __m256i p1 = __lasx_xvshuf_b(t1, t1, sh); + __m256i lo = __lasx_xvpermi_q(p0, p1, 0x02); + __m256i hi = __lasx_xvpermi_q(p0, p1, 0x13); + + a = __lasx_xvilvl_d(hi, lo); + b = __lasx_xvilvh_d(hi, lo); +} + +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float mult = multptr[out_d]; + int bias = biasptr[out_d]; + int biasCopy; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 + dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 32; + __m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02), + vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12), + vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22); + __m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp), + outmin = 
__lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127); + __m256 vmult = _v256_setall_ps(mult); + __m256i vout0, vout1, vout2, vout3; + + if( stride_w == 1 ) + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0), + v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0), + v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0), + v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0), + v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0), + v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0), + v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0), + v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0), + v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0); + + vout0 = vout1 = vout2 = vout3 = vbias; + _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult))); + vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult))); + vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult))); + vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult))); + + vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax); + vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax); + vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax); + vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax); + + __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0); + __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4); + __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4); + __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4); + } + } + else + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + vout0 = vout1 = vout2 = vout3 = vbias; + _v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, 
vout3); + _v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult))); + vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult))); + vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult))); + vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult))); + + vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax); + vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax); + vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax); + vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax); + + __lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0); + __lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4); + __lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4); + __lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4); + } + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 8; i += 8 ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0), + vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0), + vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0), + vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0); + + __m128i voutzp = __lsx_vreplgr2vr_w(outZp); + __m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0); + + vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0); + vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1); + vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2); + vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3); + vs4 = _v256_fmadds8_s32(__lasx_xvld((const 
__m256i*)(wptr + wstep*4), 0), v, vs4); + vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5); + vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6); + vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7); + } + + /*s0*/ + __m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0); + __m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w); + + __m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1); + __m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w); + + __m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2); + __m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w); + + __m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3); + __m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w); + + __m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d); + __m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d); + __m256i s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0); + + /*s1*/ + __m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4); + __m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w); + + __m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5); + __m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w); + + __m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6); + __m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w); + + __m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7); + __m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w); + + __m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d); + __m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d); + __m256i s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4); + + s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1)); + s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1)); + + __m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0)); + __m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4)); + + t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0)))); + t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4)))); + + t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax); + t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax); + + __lsx_vst(t0, (__m128i*)(dst + i), 0); + __lsx_vst(t1, (__m128i*)(dst + i), 4*4); + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = __lasx_xvreplgr2vr_d(0); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = __lasx_xvld((const __m256i*)(vec + k), 0); + vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0); + } + + __m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0); + int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3]; + dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]); + } + +} +#endif // CV_LASX + CV_CPU_OPTIMIZATION_NAMESPACE_END }} // namespace diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 7bb3014fc0..678a052c7c 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -986,12 +986,13 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; int blk_size_cn; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) - 
, blk_size_cn(0) + , useLASX(false), blk_size_cn(0) {} static void run( const Mat& input, Mat& output, const Mat& weights, @@ -1049,6 +1050,7 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D; + p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D; int kernel_d = isConv3D? kernel_size[0] : 1; int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; @@ -1256,6 +1258,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); + else #endif { const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1631,6 +1640,12 @@ public: opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else + #endif + #if CV_TRY_LASX + if(useLASX) + opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, relu, cn0 == 0); + else #endif for( int i = 0; i < outCn; i += 2 ) { @@ -2437,6 +2452,7 @@ public: useAVX2 = checkHardwareSupport(CPU_AVX2); useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; useRVV = checkHardwareSupport(CPU_RVV); + useLASX = checkHardwareSupport(CPU_LASX); } void operator()(const Range& range_) const CV_OVERRIDE @@ -2474,6 +2490,11 @@ public: opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); } else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + else #endif for( m = 0; m < mmax; m += 2 ) { @@ -2574,6 +2595,7 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; }; class Col2ImInvoker : public cv::ParallelLoopBody diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 509f6cc177..71ca706ac4 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -173,7 +173,7 @@ public: class FullyConnected : public ParallelLoopBody { public: - FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {} + FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {} static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, const ActivationLayer* activ, int nstripes) @@ -197,6 +197,7 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2); p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; p.useRVV = checkHardwareSupport(CPU_RVV); + p.useLASX = checkHardwareSupport(CPU_LASX); parallel_for_(Range(0, nstripes), p, nstripes); } @@ -250,6 +251,11 @@ public: if( useRVV ) opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else + #endif + #if CV_TRY_LASX + if( useLASX ) + opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + else #endif { int i = 0; @@ -305,6 +311,7 @@ public: bool useAVX2; bool useAVX512; bool useRVV; + bool useLASX; }; #ifdef HAVE_OPENCL diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index fd88a3c3d2..f706abfa76 
100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr, #endif // CV_RVV +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + float r0 = 1.f, r1 = 1.f, r2 = 1.f; + __m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f); + __m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128i mask = __lsx_vld((const float*)maskbuf, 0); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const float* wptr0 = weights + i*wstep; + const float* wptr1 = wptr0 + wstep; + const float* wptr2 = wptr1 + wstep; + float* outptr0 = output + i*outPlaneSize; + float* outptr1 = outptr0 + outPlaneSize; + float* outptr2 = outptr1 + outPlaneSize; + float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + } + } + + if( relu ) + { + r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2]; + if( i+2 >= outCn ) + { + r2 = r1; + if( i+1 >= outCn ) + r2 = r1 = r0; + } + vr0 = _v256_extract_low(_v256_setall_ps(r0)); + vr1 = _v256_extract_low(_v256_setall_ps(r1)); + vr2 = _v256_extract_low(_v256_setall_ps(r2)); + } + + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const float* rptr = rowbuf + j*vecsize_aligned; + + __m256i tmp; + __m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp), + vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp); + + for (; k < vecsize; k += 8, rptr += 8 ) + { + __m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0); + __m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0); + __m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0); + __m256 r0 = (__m256)__lasx_xvld(rptr, 0); + + vs00 = __lasx_xvfmadd_s(w0, r0, vs00); + vs10 = __lasx_xvfmadd_s(w1, r0, vs10); + vs20 = __lasx_xvfmadd_s(w2, r0, vs20); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0); + vs01 = __lasx_xvfmadd_s(w0, r0, vs01); + vs11 = __lasx_xvfmadd_s(w1, r0, vs11); + vs21 = __lasx_xvfmadd_s(w2, r0, vs21); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0); + vs02 = __lasx_xvfmadd_s(w0, r0, vs02); + vs12 = __lasx_xvfmadd_s(w1, r0, vs12); + vs22 = __lasx_xvfmadd_s(w2, r0, vs22); + + r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0); + vs03 = 
__lasx_xvfmadd_s(w0, r0, vs03); + vs13 = __lasx_xvfmadd_s(w1, r0, vs13); + vs23 = __lasx_xvfmadd_s(w2, r0, vs23); + } + + /*t0*/ + __m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm); + __m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32); + __m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl); + + __m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm); + __m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32); + __m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl); + + __m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm); + __m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32); + __m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl); + + __m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm); + __m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32); + __m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl); + + __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w); + __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w); + __m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*t1*/ + __m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm); + __m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32); + __m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl); + + __m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm); + __m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32); + __m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl); + + __m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm); + __m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32); + __m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl); + + __m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm); + __m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32); + __m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl); + + __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w); + __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w); + __m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + /*t2*/ + __m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm); + __m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32); + __m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl); + + __m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm); + __m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32); + __m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl); + + __m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm); + __m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32); + __m256 vs22_add_4w = 
__lasx_xvfadd_s(vs22_add_2w, tmp22_srl); + + __m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm); + __m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32); + __m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl); + + __m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w); + __m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w); + __m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20); + + t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1)); + t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1)); + t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1)); + + __m128 s0, s1, s2; + + if( initOutput ) + { + s0 = _v256_extract_low(_v256_setall_ps(bias0)); + s1 = _v256_extract_low(_v256_setall_ps(bias1)); + s2 = _v256_extract_low(_v256_setall_ps(bias2)); + } + else + { + s0 = (__m128)__lsx_vld(outptr0 + j, 0); + s1 = (__m128)__lsx_vld(outptr1 + j, 0); + s2 = (__m128)__lsx_vld(outptr2 + j, 0); + } + + s0 = __lsx_vfadd_s(s0, *(__m128*)&t0); + s1 = __lsx_vfadd_s(s1, *(__m128*)&t1); + s2 = __lsx_vfadd_s(s2, *(__m128*)&t2); + + if( relu ) + { + __m128i m0 = __lsx_vfcmp_clt_s(z, s0); + __m128i m1 = __lsx_vfcmp_clt_s(z, s1); + __m128i m2 = __lsx_vfcmp_clt_s(z, s2); + s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0); + s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1); + s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2); + } + + if( tail ) + { + s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask); + s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask); + s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask); + } + + __lsx_vst(s0, outptr0 + j, 0); + __lsx_vst(s1, outptr1 + j, 0); + __lsx_vst(s2, outptr2 + j, 0); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const float* rptr0 = rowbuf + j*vecsize_aligned; + const float* rptr1 = rowbuf + (j+1)*vecsize_aligned; + float s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; + r = rptr1[k]; + s01 += w0*r; s11 += w1*r; s21 += w2*r; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s01 = s01 > 0.f ? s01 : s01*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s11 = s11 > 0.f ? s11 : s11*r1; + s20 = s20 > 0.f ? s20 : s20*r2; + s21 = s21 > 0.f ? s21 : s21*r2; + } + + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const float* rptr0 = rowbuf + j*vecsize_aligned; + float s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s20 = s20 > 0.f ? 
s20 : s20*r2; + } + + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } +} + +static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = (__m256)__lasx_xvld(ptr, 0); + __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); + + __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); + __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); + + a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); + b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), + vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), + vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); + __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), + vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), + v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), + v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), + v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), + v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), + v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), + v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), + v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), + v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ) +{ + int i = 0; + __m256i v256_tmp; + + for( ; i <= nvecs - 8; i += 8 ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), + vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = (__m256)__lasx_xvld(vec + k, 0); + + vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0); + vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1); + vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2); + vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3); + vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4); + vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5); + vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6); + vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7); + } + + /*s0*/ + __m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm); + __m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32); + __m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl); + + __m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm); + __m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32); + __m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl); + + __m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm); + __m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32); + __m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl); + + __m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm); + __m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32); + __m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl); + + __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w); + __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w); + __m256 s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00); + + /*s1*/ + __m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + 
(0<<2) + 1); + __m256 vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm); + __m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32); + __m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl); + + __m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm); + __m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32); + __m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl); + + __m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm); + __m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32); + __m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl); + + __m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm); + __m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32); + __m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl); + + __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w); + __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w); + __m256 s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10); + + s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1)); + s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1)); + + s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0)); + s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4)); + + __lsx_vst(*(__m128*)&s0, dst + i, 0); + __lsx_vst(*(__m128*)&s1, dst + i, 4*4); + } + + float temp = 0.f; + for( ; i < nvecs; i++ ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = (__m256)__lasx_xvld(vec + k, 0); + vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0); + } + + __m256i vs0_perm = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1); + __m256 vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm); + __m256i tmp_srl = __lasx_xvsrli_d(vs0_add_2w, 32); + __m256 vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl); + temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4]; + dst[i] = temp + bias[i]; + } +} + + +void fastGEMM( const float* aptr, size_t astep, const float* bptr, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ) +{ + int n = 0; + + for( ; n <= nb - 16; n += 16 ) + { + for( int m = 0; m < ma; m += 4 ) + { + const float* aptr0 = aptr + astep*m; + const float* aptr1 = aptr + astep*std::min(m+1, ma-1); + const float* aptr2 = aptr + astep*std::min(m+2, ma-1); + const float* aptr3 = aptr + astep*std::min(m+3, ma-1); + + float* cptr0 = cptr + cstep*m; + float* cptr1 = cptr + cstep*std::min(m+1, ma-1); + float* cptr2 = cptr + cstep*std::min(m+2, ma-1); + float* cptr3 = cptr + cstep*std::min(m+3, ma-1); + + __m256i v256_tmp; + __m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + __m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp); + + for( int k = 0; k < na; k++ ) + { + __m256 a0 = _v256_setall_ps(aptr0[k]); + __m256 a1 = _v256_setall_ps(aptr1[k]); + __m256 a2 = _v256_setall_ps(aptr2[k]); + __m256 a3 = _v256_setall_ps(aptr3[k]); + + 
__m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0); + __m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0); + d00 = __lasx_xvfmadd_s(a0, b0, d00); + d01 = __lasx_xvfmadd_s(a0, b1, d01); + d10 = __lasx_xvfmadd_s(a1, b0, d10); + d11 = __lasx_xvfmadd_s(a1, b1, d11); + d20 = __lasx_xvfmadd_s(a2, b0, d20); + d21 = __lasx_xvfmadd_s(a2, b1, d21); + d30 = __lasx_xvfmadd_s(a3, b0, d30); + d31 = __lasx_xvfmadd_s(a3, b1, d31); + } + + __lasx_xvst(d00, cptr0 + n, 0); + __lasx_xvst(d01, cptr0 + n, 8*4); + __lasx_xvst(d10, cptr1 + n, 0); + __lasx_xvst(d11, cptr1 + n, 8*4); + __lasx_xvst(d20, cptr2 + n, 0); + __lasx_xvst(d21, cptr2 + n, 8*4); + __lasx_xvst(d30, cptr3 + n, 0); + __lasx_xvst(d31, cptr3 + n, 8*4); + } + } + + for( ; n < nb; n++ ) + { + for( int m = 0; m < ma; m++ ) + { + const float* aptr0 = aptr + astep*m; + float* cptr0 = cptr + cstep*m; + float d0 = 0.f; + + for( int k = 0; k < na; k++ ) + d0 += aptr0[k]*bptr[k*bstep + n]; + + cptr0[n] = d0; + } + } +} + +#endif // CV_LASX + CV_CPU_OPTIMIZATION_NAMESPACE_END }} // namespace diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index f57d05faee..c6dd063f73 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2178,6 +2178,9 @@ public: #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif + #if CV_TRY_LASX + bool useLASX = CV_CPU_HAS_SUPPORT_LASX; + #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols); @@ -2241,6 +2244,10 @@ public: if ( useAVX2 ) x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); #endif + #if CV_TRY_LASX + if ( useLASX ) + x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); + #endif #if CV_SIMD128 { v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp index 1f8f1c5d17..4b81b5e79d 100644 --- a/modules/imgproc/src/imgwarp.hpp +++ b/modules/imgproc/src/imgwarp.hpp @@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X #endif } +namespace opt_LASX +{ +#if CV_TRY_LASX +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); +#endif +} + namespace opt_SSE4_1 { #if CV_TRY_SSE4_1 diff --git a/modules/imgproc/src/imgwarp.lasx.cpp b/modules/imgproc/src/imgwarp.lasx.cpp new file mode 100644 index 0000000000..f6bf2a13cb --- /dev/null +++ b/modules/imgproc/src/imgwarp.lasx.cpp @@ -0,0 +1,98 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "imgwarp.hpp" +#include "opencv2/core/hal/intrin.hpp" + +namespace cv +{ +namespace opt_LASX +{ + +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) +{ + const int AB_BITS = MAX(10, (int)INTER_BITS); + int x1 = 0; + __m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1); + __m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0); + for (; x1 <= bw - 16; x1 += 16) + { + __m256i tx0, tx1, ty0, ty1; + tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX); + ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY); + tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX); + ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY); + + tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS); + ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS); + tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS); + ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS); + + __m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask), + __lasx_xvand_v(tx1, fxy_mask)); + __m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask), + __lasx_xvand_v(ty1, fxy_mask)); + tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS), + __lasx_xvsrai_w(tx1, INTER_BITS)); + ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS), + __lasx_xvsrai_w(ty1, INTER_BITS)); + fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS)); + fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); + + __lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0); + __lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2); + __lasx_xvst(fx_, (__m256i*)(alpha + x1), 0); + } + return x1; +} + +} +} +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 90a05085e3..4ad64534be 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify); } else +#endif +#if CV_TRY_LASX + if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify); + else + opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify); + } + else #endif { resizeNNInvoker invoker(src, dst, x_ofs, ify); diff --git a/modules/imgproc/src/resize.hpp b/modules/imgproc/src/resize.hpp index 67cf5184af..1636e7585e 100644 --- a/modules/imgproc/src/resize.hpp +++ b/modules/imgproc/src/resize.hpp @@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double); int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width); #endif } + +namespace opt_LASX +{ +#if CV_TRY_LASX +void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double); +void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double); +#endif +} + } #endif /* End of file. */ diff --git a/modules/imgproc/src/resize.lasx.cpp b/modules/imgproc/src/resize.lasx.cpp new file mode 100644 index 0000000000..fece47087d --- /dev/null +++ b/modules/imgproc/src/resize.lasx.cpp @@ -0,0 +1,249 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "resize.hpp" +#include "opencv2/core/hal/intrin.hpp" + +namespace cv +{ +namespace opt_LASX +{ + +class resizeNNInvokerLASX4 CV_FINAL : + public ParallelLoopBody +{ +public: + resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0x7); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val; + __lasx_xvst(pixels, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val; + __lasx_xvst(pixels, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat& src; + Mat& dst; + int* x_ofs; + double ify; + + resizeNNInvokerLASX4(const resizeNNInvokerLASX4&); + resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&); +}; + +class resizeNNInvokerLASX2 CV_FINAL : + public ParallelLoopBody +{ +public: + resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0xf); + const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, + 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); + const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), 
ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val; + + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val; + + const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000); + __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask); + __lasx_xvst(ints_permuted, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val; + + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val; + + const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000); + __m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask); + __lasx_xvst(ints_permuted, (int*)D, 0); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat& src; + Mat& dst; + int* x_ofs; + double ify; + + resizeNNInvokerLASX2(const resizeNNInvokerLASX2&); + resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&); +}; + +void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify) +{ + resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify) +{ + resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +} +} +/* End of file. */
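For reference, the per-pixel operation that both resizeNNInvokerLASX2 and resizeNNInvokerLASX4 vectorize is the same one their scalar tail loops already perform: copy the 2- or 4-byte source pixel found at the precomputed byte offset x_ofs[x] into consecutive output pixels. Below is a minimal scalar sketch of that contract; it is not part of the patch, the helper name resizeNNRowRef is illustrative only, and it assumes x_ofs[] holds byte offsets into the source row, as the tail loops above imply.

#include <cstring>
#include <cstddef>

// Scalar reference for one destination row of the nearest-neighbour resize:
// the LASX paths gather 8 outputs per iteration (4-byte pixels) or 16 outputs
// per iteration (2-byte pixels) and must produce exactly this result.
static void resizeNNRowRef(const unsigned char* S, unsigned char* D,
                           const int* x_ofs, int width, int pix_size)
{
    for (int x = 0; x < width; x++)
        std::memcpy(D + (std::size_t)x * pix_size,   // x-th output pixel
                    S + x_ofs[x],                    // source pixel, byte offset
                    (std::size_t)pix_size);
}

The 2-byte LASX path reaches the same result indirectly: v256_lut_quads gathers 4-byte groups from S and from S - 2, __lasx_xvbitsel_v merges the two gathers so each 32-bit lane carries a wanted pixel, and the __lasx_xvshuf_b / __lasx_xvperm_w pair puts the sixteen 2-byte pixels back in output order before the single 32-byte store.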