Merge pull request #24565 from CNClareChen:4.x

Change the lsx to baseline features.
pull/24655/head
Alexander Smorkalov 1 year ago committed by GitHub
commit 3893936243
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 7
      cmake/OpenCVCompilerOptimizations.cmake
  2. 26
      modules/core/CMakeLists.txt
  3. 6
      modules/core/include/opencv2/core/hal/intrin.hpp
  4. 47
      modules/core/include/opencv2/core/hal/intrin_lasx.hpp
  5. 15
      modules/core/src/system.cpp

@ -403,11 +403,8 @@ elseif(LOONGARCH64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX")
ocv_update(CPU_LSX_FLAGS_ON "-mlsx")
ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX")
set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
else()
set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
set(CPU_DISPATCH "LASX" CACHE STRING "${HELP_CPU_DISPATCH}")
endif()

@ -1,21 +1,21 @@
set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
ocv_add_dispatched_file(has_non_zero SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
ocv_add_dispatched_file(mean SSE2 AVX2)
ocv_add_dispatched_file(merge SSE2 AVX2)
ocv_add_dispatched_file(split SSE2 AVX2)
ocv_add_dispatched_file(sum SSE2 AVX2)
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2 LASX)
ocv_add_dispatched_file(stat SSE4_2 AVX2 LASX)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3 LASX)
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3 LASX)
ocv_add_dispatched_file(convert_scale SSE2 AVX2 LASX)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2 LASX)
ocv_add_dispatched_file(has_non_zero SSE2 AVX2 LASX )
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD LASX)
ocv_add_dispatched_file(mean SSE2 AVX2 LASX)
ocv_add_dispatched_file(merge SSE2 AVX2 LASX)
ocv_add_dispatched_file(split SSE2 AVX2 LASX)
ocv_add_dispatched_file(sum SSE2 AVX2 LASX)
# dispatching for accuracy tests
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX LASX)
ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)

@ -246,12 +246,6 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_lsx.hpp"
#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1
#endif
#include "opencv2/core/hal/intrin_cpp.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"

@ -1419,20 +1419,6 @@ inline v_uint32x8 v_popcount(const v_int32x8& a)
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
/** Mask **/
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
inline int v_signmask(const v_int8x32& a)
{
__m256i result = __lasx_xvmskltz_b(a.val);
@ -2151,7 +2137,8 @@ template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
__m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
template<int n> inline
@ -2165,7 +2152,8 @@ template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
__m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
template<int n> inline
@ -2179,7 +2167,8 @@ template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
__m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
// 32
@ -2198,7 +2187,8 @@ inline void v_pack_store(short* ptr, const v_int32x8& a)
inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
@ -2212,7 +2202,8 @@ template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
__m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
template<int n> inline
@ -2223,7 +2214,8 @@ template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
__m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
template<int n> inline
@ -2234,7 +2226,8 @@ template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
__m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
__lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
// 64
@ -2263,7 +2256,11 @@ v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); }
{
__m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
@ -2271,7 +2268,11 @@ v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); }
{
__m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
__lasx_xvstelm_d(res, ptr, 0, 0);
__lasx_xvstelm_d(res, ptr, 8, 2);
}
// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)

@ -154,6 +154,12 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
# endif
#endif
#if defined __loongarch64
#include "sys/auxv.h"
#define LA_HWCAP_LSX (1<<4)
#define LA_HWCAP_LASX (1<<5)
#endif
#if defined _WIN32 || defined WINCE
#ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
#define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@ -704,12 +710,11 @@ struct HWFeatures
have[CV_CPU_RVV] = true;
#endif
#if defined __loongarch_sx
have[CV_CPU_LSX] = true;
#endif
#if defined __loongarch64 && defined __linux__
int flag = (int)getauxval(AT_HWCAP);
#if defined __loongarch_asx
have[CV_CPU_LASX] = true;
have[CV_CPU_LSX] = (flag & LA_HWCAP_LSX) != 0;
have[CV_CPU_LASX] = (flag & LA_HWCAP_LASX) != 0;
#endif
bool skip_baseline_check = false;

Loading…
Cancel
Save