diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index cc57cbdcae..04f10aee28 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -2,7 +2,7 @@ # SSE / SSE2 (always available on 64-bit CPUs) # SSE3 / SSSE3 # SSE4_1 / SSE4_2 / POPCNT -# AVX / AVX2 / AVX512 +# AVX / AVX2 / AVX_512F # FMA3 # CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag) @@ -26,7 +26,7 @@ # # CPU_DISPATCH_FLAGS_${opt} - flags for source files compiled separately (.avx2.cpp) -set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX512") +set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F") list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) list(APPEND CPU_ALL_OPTIMIZATIONS VSX) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -145,7 +145,7 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ") endif() if(X86 OR X86_64) - ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX512") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F") ocv_update(CPU_SSE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") ocv_update(CPU_SSE2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") @@ -157,11 +157,11 @@ if(X86 OR X86_64) ocv_update(CPU_AVX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") ocv_update(CPU_AVX2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") - ocv_update(CPU_AVX512_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp") + ocv_update(CPU_AVX_512F_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp") if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE) - ocv_update(CPU_AVX512_IMPLIES "AVX2") - ocv_update(CPU_AVX512_FORCE "") # Don't force other optimizations + ocv_update(CPU_AVX_512F_IMPLIES "AVX2") + ocv_update(CPU_AVX_512F_FORCE "") # Don't force other optimizations ocv_update(CPU_AVX2_IMPLIES "AVX;FMA3;FP16") ocv_update(CPU_FMA3_IMPLIES "AVX2") ocv_update(CPU_FMA3_FORCE "") # Don't force other optimizations @@ -205,7 +205,7 @@ if(X86 OR X86_64) if(NOT X86_64) # x64 compiler doesn't support /arch:sse ocv_intel_compiler_optimization_option(SSE "-msse" "/arch:SSE") endif() - #ocv_intel_compiler_optimization_option(AVX512 "-march=core-avx512") + ocv_intel_compiler_optimization_option(AVX_512F "-march=common-avx512" "/arch:COMMON-AVX512") elseif(CMAKE_COMPILER_IS_GNUCXX) ocv_update(CPU_AVX2_FLAGS_ON "-mavx2") ocv_update(CPU_FP16_FLAGS_ON "-mf16c") @@ -219,7 +219,8 @@ if(X86 OR X86_64) ocv_update(CPU_SSE2_FLAGS_ON "-msse2") ocv_update(CPU_SSE_FLAGS_ON "-msse") if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") - ocv_update(CPU_AVX512_FLAGS_ON "-mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi") + # -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi + ocv_update(CPU_AVX_512F_FLAGS_ON "-mavx512f") endif() elseif(MSVC) ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2") diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 75f6ca9c7d..5df7b8b4ef 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -82,9 +82,9 @@ # include # define CV_AVX2 1 #endif -#ifdef CV_CPU_COMPILE_AVX512 +#ifdef CV_CPU_COMPILE_AVX_512F # include -# define CV_AVX512 1 +# define CV_AVX_512F 1 #endif #ifdef CV_CPU_COMPILE_FMA3 # define CV_FMA3 1 diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 1c7dbaf852..1b939a0a19 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -165,20 +165,20 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) -#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512 -# define CV_TRY_AVX512 1 -# define CV_CPU_HAS_SUPPORT_AVX512 1 -# define CV_CPU_CALL_AVX512(fn, args) return (opt_AVX512::fn args) -#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512 -# define CV_TRY_AVX512 1 -# define CV_CPU_HAS_SUPPORT_AVX512 (cv::checkHardwareSupport(CV_CPU_AVX512)) -# define CV_CPU_CALL_AVX512(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512) return (opt_AVX512::fn args) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX_512F +# define CV_TRY_AVX_512F 1 +# define CV_CPU_HAS_SUPPORT_AVX_512F 1 +# define CV_CPU_CALL_AVX_512F(fn, args) return (opt_AVX_512F::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX_512F +# define CV_TRY_AVX_512F 1 +# define CV_CPU_HAS_SUPPORT_AVX_512F (cv::checkHardwareSupport(CV_CPU_AVX_512F)) +# define CV_CPU_CALL_AVX_512F(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args) #else -# define CV_TRY_AVX512 0 -# define CV_CPU_HAS_SUPPORT_AVX512 0 -# define CV_CPU_CALL_AVX512(fn, args) +# define CV_TRY_AVX_512F 0 +# define CV_CPU_HAS_SUPPORT_AVX_512F 0 +# define CV_CPU_CALL_AVX_512F(fn, args) #endif -#define __CV_CPU_DISPATCH_CHAIN_AVX512(fn, args, mode, ...) CV_CPU_CALL_AVX512(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...) CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON # define CV_TRY_NEON 1 diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index 74723beb46..fbfc87b51f 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -13,7 +13,7 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") -ocv_add_dispatched_file("layers/layers_common" AVX AVX2 AVX512) +ocv_add_dispatched_file("layers/layers_common" AVX AVX2 AVX_512F) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab java js) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index c1ce08d1f7..f533962f38 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -384,7 +384,7 @@ public: p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0); p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); - p.useAVX512 = checkHardwareSupport(CPU_AVX_512DQ); + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; int ncn = std::min(inpCn, (int)BLK_SIZE_CN); p.ofstab_.resize(kernel.width*kernel.height*ncn); @@ -564,10 +564,10 @@ public: // now compute dot product of the weights // and im2row-transformed part of the tensor int bsz = ofs1 - ofs0; - #if CV_TRY_AVX512 + #if CV_TRY_AVX_512F /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */ if(useAVX512) - opt_AVX512::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + opt_AVX_512F::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else #endif @@ -1102,7 +1102,7 @@ public: nstripes_ = nstripes; useAVX = checkHardwareSupport(CPU_AVX); useAVX2 = checkHardwareSupport(CPU_AVX2); - useAVX512 = checkHardwareSupport(CPU_AVX_512DQ); + useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; } void operator()(const Range& range_) const @@ -1120,9 +1120,9 @@ public: size_t bstep = b_->step1(); size_t cstep = c_->step1(); - #if CV_TRY_AVX512 + #if CV_TRY_AVX_512F if( useAVX512 ) - opt_AVX512::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + opt_AVX_512F::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else #endif #if CV_TRY_AVX2 diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index cee611a207..88279d23dd 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -161,7 +161,7 @@ public: p.activ = activ; p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); - p.useAVX512 = checkHardwareSupport(CPU_AVX_512DQ); + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX_512F; parallel_for_(Range(0, nstripes), p, nstripes); } @@ -196,9 +196,9 @@ public: memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); - #if CV_TRY_AVX512 + #if CV_TRY_AVX_512F if( useAVX512 ) - opt_AVX512::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + opt_AVX_512F::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else #endif #if CV_TRY_AVX2 diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index 7354caa689..a480426ba6 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -301,7 +301,7 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, { int n = 0; -#ifdef CV_AVX512 +#if CV_AVX_512F for( ; n <= nb - 32; n += 32 ) { for( int m = 0; m < ma; m += 4 )