diff --git a/CMakeLists.txt b/CMakeLists.txt index f9a8d46fd4..fde8b578ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -327,7 +327,6 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CV_GCC ) OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) -OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF ((CV_GCC OR CV_CLANG) AND PPC64LE) ) OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) ) if(NOT IOS) # Use CPU_BASELINE instead OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 7deb127e65..4c115f0e53 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -5,6 +5,10 @@ # AVX / AVX2 / AVX_512F # FMA3 +# ppc64le arch: +# VSX (always available on Power8) +# VSX3 (always available on Power9) + # CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag) # CPU_{opt}_IMPLIES= # CPU_{opt}_FORCE= - subset of "implies" list @@ -29,7 +33,7 @@ set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_SKX") list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) -list(APPEND CPU_ALL_OPTIMIZATIONS VSX) +list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) ocv_update(CPU_VFPV3_FEATURE_ALIAS "") @@ -81,7 +85,7 @@ ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON) ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF) ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF) -ocv_optimization_process_obsolete_option(ENABLE_VSX VSX OFF) +ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON) macro(ocv_is_optimization_in_list resultvar check_opt) set(__checked "") @@ -289,14 +293,24 @@ elseif(ARM OR AARCH64) set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") endif() elseif(PPC64LE) - ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3") ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp") + ocv_update(CPU_VSX3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx3.cpp") + + if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE) + ocv_update(CPU_VSX3_IMPLIES "VSX") + endif() if(CV_CLANG AND (NOT ${CMAKE_CXX_COMPILER} MATCHES "xlc")) ocv_update(CPU_VSX_FLAGS_ON "-mvsx -maltivec") + ocv_update(CPU_VSX3_FLAGS_ON "-mpower9-vector") else() ocv_update(CPU_VSX_FLAGS_ON "-mcpu=power8") + ocv_update(CPU_VSX3_FLAGS_ON "-mcpu=power9 -mtune=power9") endif() + + set(CPU_DISPATCH "VSX3" CACHE STRING "${HELP_CPU_DISPATCH}") + set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}") endif() # Helper values for cmake-gui diff --git a/cmake/checks/cpu_vsx.cpp b/cmake/checks/cpu_vsx.cpp index 6d744825b6..8d0cd574ce 100644 --- a/cmake/checks/cpu_vsx.cpp +++ b/cmake/checks/cpu_vsx.cpp @@ -1,8 +1,12 @@ -# if defined(__VSX__) -# include -# else -# error "VSX is not supported" -# endif +#if defined(__VSX__) + #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) + #include + #else + #error "OpenCV only supports little-endian mode" + #endif +#else + #error "VSX is not supported" +#endif int main() { diff --git a/cmake/checks/cpu_vsx3.cpp b/cmake/checks/cpu_vsx3.cpp new file mode 100644 index 0000000000..31d7014a7c --- /dev/null +++ b/cmake/checks/cpu_vsx3.cpp @@ -0,0 +1,17 @@ +#if defined(__VSX__) + #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) + #include + #else + #error "OpenCV only supports little-endian mode" + #endif +#else + #error "VSX3 is not supported" +#endif + +int main() +{ + __vector unsigned char a = vec_splats((unsigned char)1); + __vector unsigned char b = vec_splats((unsigned char)2); + __vector unsigned char r = vec_absd(a, b); + return 0; +} diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index e31900e601..992d09de0b 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -2,7 +2,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) ocv_add_dispatched_file(stat SSE4_2 AVX2) -ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 4c686af97f..57aa0ce2fb 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -107,7 +107,7 @@ # include #endif -#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#ifdef CV_CPU_COMPILE_VSX # include # undef vector # undef pixel @@ -115,6 +115,10 @@ # define CV_VSX 1 #endif +#ifdef CV_CPU_COMPILE_VSX3 +# define CV_VSX3 1 +#endif + #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX @@ -237,3 +241,7 @@ struct VZeroUpperGuard { #ifndef CV_VSX # define CV_VSX 0 #endif + +#ifndef CV_VSX3 +# define CV_VSX3 0 +#endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 84f489bab4..ad1339796d 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -315,5 +315,26 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...) CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3 +# define CV_TRY_VSX3 1 +# define CV_CPU_FORCE_VSX3 1 +# define CV_CPU_HAS_SUPPORT_VSX3 1 +# define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3 +# define CV_TRY_VSX3 1 +# define CV_CPU_FORCE_VSX3 0 +# define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3)) +# define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args) +# define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args) +#else +# define CV_TRY_VSX3 0 +# define CV_CPU_FORCE_VSX3 0 +# define CV_CPU_HAS_SUPPORT_VSX3 0 +# define CV_CPU_CALL_VSX3(fn, args) +# define CV_CPU_CALL_VSX3_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index f8df11a500..1e8a697e90 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -240,9 +240,10 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard #define CV_CPU_AVX_512VBMI 20 #define CV_CPU_AVX_512VL 21 -#define CV_CPU_NEON 100 +#define CV_CPU_NEON 100 -#define CV_CPU_VSX 200 +#define CV_CPU_VSX 200 +#define CV_CPU_VSX3 201 // CPU features groups #define CV_CPU_AVX512_SKX 256 @@ -280,6 +281,7 @@ enum CpuFeatures { CPU_NEON = 100, CPU_VSX = 200, + CPU_VSX3 = 201, CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 4ff839ad47..c1754ac763 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -107,15 +107,14 @@ void* allocSingletonBuffer(size_t size) { return fastMalloc(size); } # include #endif -#ifndef __VSX__ -# if defined __PPC64__ && defined __linux__ -# include "sys/auxv.h" -# ifndef AT_HWCAP2 -# define AT_HWCAP2 26 -# endif -# ifndef PPC_FEATURE2_ARCH_2_07 -# define PPC_FEATURE2_ARCH_2_07 0x80000000 -# endif + +#if CV_VSX && defined __linux__ +# include "sys/auxv.h" +# ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +# endif +# ifndef PPC_FEATURE2_ARCH_3_00 +# define PPC_FEATURE2_ARCH_3_00 0x00800000 # endif #endif @@ -359,6 +358,7 @@ struct HWFeatures g_hwFeatureNames[CPU_NEON] = "NEON"; g_hwFeatureNames[CPU_VSX] = "VSX"; + g_hwFeatureNames[CPU_VSX3] = "VSX3"; g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX"; } @@ -513,14 +513,14 @@ struct HWFeatures #endif #endif - #ifdef __VSX__ - have[CV_CPU_VSX] = true; - #elif (defined __PPC64__ && defined __linux__) - uint64 hwcaps = getauxval(AT_HWCAP); + // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs + have[CV_CPU_VSX] = (CV_VSX); + // TODO: Check VSX3 availability in runtime for other platforms + #if CV_VSX && defined __linux__ uint64 hwcap2 = getauxval(AT_HWCAP2); - have[CV_CPU_VSX] = (hwcaps & PPC_FEATURE_PPC_LE && hwcaps & PPC_FEATURE_HAS_VSX && hwcap2 & PPC_FEATURE2_ARCH_2_07); + have[CV_CPU_VSX3] = (hwcap2 & PPC_FEATURE2_ARCH_3_00); #else - have[CV_CPU_VSX] = false; + have[CV_CPU_VSX3] = (CV_VSX3); #endif int baseline_features[] = { CV_CPU_BASELINE_FEATURES };