diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index b849f02b14..9b48dc857f 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -275,6 +275,11 @@ set(CPU_BASELINE_FLAGS "") set(CPU_BASELINE_FINAL "") set(CPU_DISPATCH_FINAL "") +if(CV_DISABLE_OPTIMIZATION) + set(CPU_DISPATCH "") + set(CPU_DISPATCH_REQUIRE "") +endif() + macro(ocv_check_compiler_optimization OPT) if(NOT DEFINED CPU_${OPT}_SUPPORTED) if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE) @@ -319,7 +324,7 @@ macro(ocv_check_compiler_optimization OPT) endmacro() foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) - set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "" FORCE) + set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "") if(NOT DEFINED CPU_${OPT}_FORCE) set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}") endif() @@ -515,15 +520,27 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T endforeach() foreach(fname ${${SOURCES_VAR_NAME}}) string(TOLOWER "${fname}" fname_LOWER) - if(fname_LOWER MATCHES "[.]opt_.*[.]cpp$") - if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS) - message(STATUS "Excluding from source files list: ${fname}") + if(fname_LOWER MATCHES "\\.([^./]*)\\.cpp$") # <name>.<mode>.cpp: capture only the last dot-separated token of the file name, never a directory part + string(TOUPPER "${CMAKE_MATCH_1}" OPT_) + if(OPT_ MATCHES "(CUDA.*|DISPATCH.*|OCL)") # don't touch files like filename.cuda.cpp + list(APPEND __result "${fname}") + #continue() + elseif(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS) + message(STATUS "Excluding from source files list (optimization is disabled): ${fname}") #continue() else() + get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS) + if(__definitions) + list(APPEND __definitions "CV_CPU_DISPATCH_MODE=${OPT_}") + else() + set(__definitions "CV_CPU_DISPATCH_MODE=${OPT_}") + endif() + set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}") + set(__opt_found 0) 
foreach(OPT ${CPU_BASELINE_FINAL}) string(TOLOWER "${OPT}" OPT_LOWER) - if(fname_LOWER MATCHES "_${OPT_LOWER}[.]cpp$") + if(fname_LOWER MATCHES "\\.${OPT_LOWER}\\.cpp$") #message("${fname} BASELINE-${OPT}") set(__opt_found 1) list(APPEND __result "${fname}") @@ -533,11 +550,11 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T foreach(OPT ${CPU_DISPATCH_FINAL}) foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED}) string(TOLOWER "${OPT2}" OPT2_LOWER) - if(fname_LOWER MATCHES "_${OPT2_LOWER}[.]cpp$") + if(fname_LOWER MATCHES "\\.${OPT2_LOWER}\\.cpp$") list(APPEND __result_${OPT} "${fname}") math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1") set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE) -#message("${fname} ${OPT}") +#message("(${CPU_${OPT}_USAGE_COUNT})${fname} ${OPT}") #message(" ${CPU_DISPATCH_${OPT}_INCLUDED}") #message(" ${CPU_DISPATCH_DEFINITIONS_${OPT}}") #message(" ${CPU_DISPATCH_FLAGS_${OPT}}") @@ -573,7 +590,13 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T list(APPEND __result "$") else() foreach(fname ${__result_${OPT}}) - set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}") + get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS) + if(__definitions) + list(APPEND __definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}") + else() + set(__definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}") + endif() + set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}") set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}") endforeach() list(APPEND __result ${__result_${OPT}}) @@ -620,18 +643,25 @@ macro(ocv_compiler_optimization_fill_cpu_config) set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE} #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined 
CV_CPU_COMPILE_${OPT} # define CV_CPU_HAS_SUPPORT_${OPT} 1 -# define CV_CPU_CALL_${OPT}(...) return __VA_ARGS__ +# define CV_CPU_CALL_${OPT}(fn, args) return (opt_${OPT}::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT} # define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT})) -# define CV_CPU_CALL_${OPT}(...) if (CV_CPU_HAS_SUPPORT_${OPT}) return __VA_ARGS__ +# define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args) #else # define CV_CPU_HAS_SUPPORT_${OPT} 0 -# define CV_CPU_CALL_${OPT}(...) +# define CV_CPU_CALL_${OPT}(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_${OPT}(fn, args, mode, ...) CV_CPU_CALL_${OPT}(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) ") endif() endforeach() + set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE} +#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) +#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) 
CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ +") + + set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h") if(EXISTS "${__file}") file(READ "${__file}" __content) @@ -644,6 +674,64 @@ macro(ocv_compiler_optimization_fill_cpu_config) endif() endmacro() +# ocv_add_dispatched_file(<filename> [<OPT>...]) +# For each listed optimization writes ${CMAKE_CURRENT_BINARY_DIR}/<filename>.<opt>.cpp (a wrapper including "<filename>.simd.hpp"), appends it to OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED, and writes <filename>.simd_declarations.hpp defining CV_CPU_DISPATCH_MODES_ALL. +# Does nothing during OPENCV_INITIAL_PASS; files are rewritten only when their content differs. +macro(ocv_add_dispatched_file filename) + if(NOT OPENCV_INITIAL_PASS) + set(__codestr " +#include \"precomp.hpp\" +#include \"${filename}.simd.hpp\" +") + + set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${filename}.simd.hpp\"") + set(__dispatch_modes "BASELINE") + + set(__optimizations "${ARGN}") + if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS) + set(__optimizations "") + endif() + + foreach(OPT ${__optimizations}) + string(TOLOWER "${OPT}" OPT_LOWER) + set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${OPT_LOWER}.cpp") + if(EXISTS "${__file}") + file(READ "${__file}" __content) + else() + set(__content "") # file is missing: drop stale content left by a previous file(READ) so it gets written + endif() + if(__content STREQUAL __codestr) + #message(STATUS "${__file} contains up-to-date content") + else() + file(WRITE "${__file}" "${__codestr}") + endif() + list(APPEND OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED "${__file}") + + set(__declarations_str "${__declarations_str} +#define CV_CPU_DISPATCH_MODE ${OPT} +#include \"opencv2/core/private/cv_cpu_include_simd_declarations.hpp\" +") + set(__dispatch_modes "${OPT}, ${__dispatch_modes}") + endforeach() + + set(__declarations_str "${__declarations_str} +#define CV_CPU_DISPATCH_MODES_ALL ${__dispatch_modes} +") + + set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.simd_declarations.hpp") + if(EXISTS "${__file}") + file(READ "${__file}" __content) + else() + set(__content "") # file is missing: drop stale content left by a previous file(READ) so it gets written + endif() + if(__content STREQUAL __declarations_str) + #message(STATUS "${__file} contains up-to-date content") + else() + file(WRITE "${__file}" "${__declarations_str}") + endif() + endif() +endmacro() + if(CV_DISABLE_OPTIMIZATION OR CV_ICC) ocv_update(CV_ENABLE_UNROLLED 0) else() diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 10e1f7397c..2546ac5839 
100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -314,6 +314,7 @@ macro(ocv_glob_modules) set(OPENCV_INITIAL_PASS OFF) if(${BUILD_opencv_world}) foreach(m ${OPENCV_MODULES_BUILD}) + set(the_module "${m}") if("${m}" STREQUAL opencv_world) add_subdirectory("${OPENCV_MODULE_opencv_world_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/world") elseif(NOT OPENCV_MODULE_${m}_IS_PART_OF_WORLD AND NOT ${m} STREQUAL opencv_world) @@ -329,6 +330,7 @@ macro(ocv_glob_modules) endforeach() else() foreach(m ${OPENCV_MODULES_BUILD}) + set(the_module "${m}") if(m MATCHES "^opencv_") string(REGEX REPLACE "^opencv_" "" __shortname "${m}") add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${__shortname}") @@ -646,11 +648,13 @@ macro(ocv_set_module_sources) ocv_get_module_external_sources() endif() + if(OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED) + list(APPEND OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED}) + endif() + # use full paths for module to be independent from the module location ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS) - ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module}) - set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}") endmacro() @@ -766,6 +770,11 @@ macro(ocv_create_module) endmacro() macro(_ocv_create_module) + + ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module}) + set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") + set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE 
INTERNAL "List of source files for ${the_module}") + # The condition we ought to be testing here is whether ocv_add_precompiled_headers will # be called at some point in the future. We can't look into the future, though, # so this will have to do. diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake index 6a83218729..659973af81 100644 --- a/cmake/OpenCVPCHSupport.cmake +++ b/cmake/OpenCVPCHSupport.cmake @@ -288,11 +288,12 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input) foreach(src ${_sources}) if(NOT "${src}" MATCHES "\\.mm$") get_source_file_property(oldProps "${src}" COMPILE_FLAGS) - if(NOT oldProps) + get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS) + if(NOT oldProps AND NOT oldProps2) set(newProperties "-include \"${CMAKE_CURRENT_BINARY_DIR}/${_name}\"") set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}") else() - ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}") + ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}") endif() endif() endforeach() @@ -339,11 +340,12 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input) AND NOT "${src}" MATCHES "^\$" # CMake generator expressions ) get_source_file_property(oldProps "${src}" COMPILE_FLAGS) - if(NOT oldProps) + get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS) + if(NOT oldProps AND NOT oldProps2) set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"") set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}") else() - ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}") + ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}") endif() endif() endforeach() diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 01b272e97e..3d2cc7f594 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,4 +1,7 @@ set(the_description "The Core Functionality") + 
+ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) + ocv_add_module(core "${OPENCV_HAL_LINKER_LIBS}" OPTIONAL opencv_cudev diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 9a8537f909..aaabea38d4 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -7,6 +7,23 @@ #include "cv_cpu_config.h" #include "cv_cpu_helper.h" +#ifdef CV_CPU_DISPATCH_MODE +#define CV_CPU_OPTIMIZATION_NAMESPACE __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) +#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) { +#define CV_CPU_OPTIMIZATION_NAMESPACE_END } +#else +#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline +#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline { +#define CV_CPU_OPTIMIZATION_NAMESPACE_END } +#endif + + +#define __CV_CPU_DISPATCH_CHAIN_END(fn, args, mode, ...) /* done */ +#define __CV_CPU_DISPATCH(fn, args, mode, ...) __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#define __CV_CPU_DISPATCH_EXPAND(fn, args, ...) __CV_EXPAND(__CV_CPU_DISPATCH(fn, args, __VA_ARGS__)) +#define CV_CPU_DISPATCH(fn, args, ...) 
__CV_CPU_DISPATCH_EXPAND(fn, args, __VA_ARGS__, END) // expand macros + + #if defined CV_ENABLE_INTRINSICS \ && !defined CV_DISABLE_OPTIMIZATION \ && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \ @@ -76,6 +93,16 @@ #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ +#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX +struct VZeroUpperGuard { +#ifdef __GNUC__ + __attribute__((always_inline)) +#endif + inline ~VZeroUpperGuard() { _mm256_zeroupper(); } +}; +#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; +#endif + #endif // __OPENCV_BUILD diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index cb755d615e..8bd0457242 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -2,132 +2,147 @@ #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE # define CV_CPU_HAS_SUPPORT_SSE 1 -# define CV_CPU_CALL_SSE(...) return __VA_ARGS__ +# define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE # define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE)) -# define CV_CPU_CALL_SSE(...) if (CV_CPU_HAS_SUPPORT_SSE) return __VA_ARGS__ +# define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args) #else # define CV_CPU_HAS_SUPPORT_SSE 0 -# define CV_CPU_CALL_SSE(...) +# define CV_CPU_CALL_SSE(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2 # define CV_CPU_HAS_SUPPORT_SSE2 1 -# define CV_CPU_CALL_SSE2(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2 # define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2)) -# define CV_CPU_CALL_SSE2(...) if (CV_CPU_HAS_SUPPORT_SSE2) return __VA_ARGS__ +# define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args) #else # define CV_CPU_HAS_SUPPORT_SSE2 0 -# define CV_CPU_CALL_SSE2(...) +# define CV_CPU_CALL_SSE2(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3 # define CV_CPU_HAS_SUPPORT_SSE3 1 -# define CV_CPU_CALL_SSE3(...) return __VA_ARGS__ +# define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3 # define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3)) -# define CV_CPU_CALL_SSE3(...) if (CV_CPU_HAS_SUPPORT_SSE3) return __VA_ARGS__ +# define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args) #else # define CV_CPU_HAS_SUPPORT_SSE3 0 -# define CV_CPU_CALL_SSE3(...) +# define CV_CPU_CALL_SSE3(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3 # define CV_CPU_HAS_SUPPORT_SSSE3 1 -# define CV_CPU_CALL_SSSE3(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3 # define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3)) -# define CV_CPU_CALL_SSSE3(...) if (CV_CPU_HAS_SUPPORT_SSSE3) return __VA_ARGS__ +# define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args) #else # define CV_CPU_HAS_SUPPORT_SSSE3 0 -# define CV_CPU_CALL_SSSE3(...) +# define CV_CPU_CALL_SSSE3(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1 # define CV_CPU_HAS_SUPPORT_SSE4_1 1 -# define CV_CPU_CALL_SSE4_1(...) return __VA_ARGS__ +# define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1 # define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1)) -# define CV_CPU_CALL_SSE4_1(...) if (CV_CPU_HAS_SUPPORT_SSE4_1) return __VA_ARGS__ +# define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args) #else # define CV_CPU_HAS_SUPPORT_SSE4_1 0 -# define CV_CPU_CALL_SSE4_1(...) +# define CV_CPU_CALL_SSE4_1(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2 # define CV_CPU_HAS_SUPPORT_SSE4_2 1 -# define CV_CPU_CALL_SSE4_2(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2 # define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2)) -# define CV_CPU_CALL_SSE4_2(...) if (CV_CPU_HAS_SUPPORT_SSE4_2) return __VA_ARGS__ +# define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args) #else # define CV_CPU_HAS_SUPPORT_SSE4_2 0 -# define CV_CPU_CALL_SSE4_2(...) +# define CV_CPU_CALL_SSE4_2(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT # define CV_CPU_HAS_SUPPORT_POPCNT 1 -# define CV_CPU_CALL_POPCNT(...) return __VA_ARGS__ +# define CV_CPU_CALL_POPCNT(fn, args) return (opt_POPCNT::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT # define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT)) -# define CV_CPU_CALL_POPCNT(...) if (CV_CPU_HAS_SUPPORT_POPCNT) return __VA_ARGS__ +# define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args) #else # define CV_CPU_HAS_SUPPORT_POPCNT 0 -# define CV_CPU_CALL_POPCNT(...) +# define CV_CPU_CALL_POPCNT(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX # define CV_CPU_HAS_SUPPORT_AVX 1 -# define CV_CPU_CALL_AVX(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_AVX(fn, args) return (opt_AVX::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX # define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX)) -# define CV_CPU_CALL_AVX(...) if (CV_CPU_HAS_SUPPORT_AVX) return __VA_ARGS__ +# define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args) #else # define CV_CPU_HAS_SUPPORT_AVX 0 -# define CV_CPU_CALL_AVX(...) +# define CV_CPU_CALL_AVX(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16 # define CV_CPU_HAS_SUPPORT_FP16 1 -# define CV_CPU_CALL_FP16(...) return __VA_ARGS__ +# define CV_CPU_CALL_FP16(fn, args) return (opt_FP16::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16 # define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16)) -# define CV_CPU_CALL_FP16(...) if (CV_CPU_HAS_SUPPORT_FP16) return __VA_ARGS__ +# define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args) #else # define CV_CPU_HAS_SUPPORT_FP16 0 -# define CV_CPU_CALL_FP16(...) +# define CV_CPU_CALL_FP16(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2 # define CV_CPU_HAS_SUPPORT_AVX2 1 -# define CV_CPU_CALL_AVX2(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_AVX2(fn, args) return (opt_AVX2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2 # define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2)) -# define CV_CPU_CALL_AVX2(...) if (CV_CPU_HAS_SUPPORT_AVX2) return __VA_ARGS__ +# define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args) #else # define CV_CPU_HAS_SUPPORT_AVX2 0 -# define CV_CPU_CALL_AVX2(...) +# define CV_CPU_CALL_AVX2(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3 # define CV_CPU_HAS_SUPPORT_FMA3 1 -# define CV_CPU_CALL_FMA3(...) return __VA_ARGS__ +# define CV_CPU_CALL_FMA3(fn, args) return (opt_FMA3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3 # define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3)) -# define CV_CPU_CALL_FMA3(...) if (CV_CPU_HAS_SUPPORT_FMA3) return __VA_ARGS__ +# define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args) #else # define CV_CPU_HAS_SUPPORT_FMA3 0 -# define CV_CPU_CALL_FMA3(...) +# define CV_CPU_CALL_FMA3(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON # define CV_CPU_HAS_SUPPORT_NEON 1 -# define CV_CPU_CALL_NEON(...) 
return __VA_ARGS__ +# define CV_CPU_CALL_NEON(fn, args) return (opt_NEON::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON # define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON)) -# define CV_CPU_CALL_NEON(...) if (CV_CPU_HAS_SUPPORT_NEON) return __VA_ARGS__ +# define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args) #else # define CV_CPU_HAS_SUPPORT_NEON 0 -# define CV_CPU_CALL_NEON(...) +# define CV_CPU_CALL_NEON(fn, args) #endif +#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + +#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) +#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index f2212b4217..43f8a02a72 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -52,6 +52,17 @@ #include "cvconfig.h" #endif +#ifndef __CV_EXPAND +#define __CV_EXPAND(x) x +#endif + +#ifndef __CV_CAT +#define __CV_CAT__(x, y) x ## y +#define __CV_CAT_(x, y) __CV_CAT__(x, y) +#define __CV_CAT(x, y) __CV_CAT_(x, y) +#endif + + #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300 # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ #endif diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 34075e3830..9dd1514ea3 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -60,6 +60,25 @@ // access from within opencv code more accessible namespace cv { +#ifndef CV_DOXYGEN + +#ifdef CV_CPU_DISPATCH_MODE +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE 
__CV_CAT(hal_, CV_CPU_DISPATCH_MODE) +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#else +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { +#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#endif + + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +#endif + //! @addtogroup core_hal_intrin //! @{ @@ -281,6 +300,9 @@ template struct V_SIMD128Traits //! @} +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +#endif } #ifdef CV_DOXYGEN @@ -323,6 +345,10 @@ template struct V_SIMD128Traits namespace cv { +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +#endif + template struct V_RegTrait128; template <> struct V_RegTrait128 { @@ -407,6 +433,10 @@ template <> struct V_RegTrait128 { }; #endif +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +#endif + } // cv:: //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index e15c97d528..e8c166405e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -53,6 +53,10 @@ namespace cv { +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +#endif + /** @addtogroup core_hal_intrin "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on @@ -1827,7 +1831,9 @@ static inline bool hasSIMD128() //! 
@} - +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +#endif } #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 2bcff2bc15..c7f4e90a36 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -53,6 +53,8 @@ namespace cv //! @cond IGNORED +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + #define CV_SIMD128 1 #if defined(__aarch64__) #define CV_SIMD128_64F 1 @@ -1238,11 +1240,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) //! @brief Check CPU capability of SIMD operation static inline bool hasSIMD128() { - return checkHardwareSupport(CV_CPU_NEON); + return (CV_CPU_HAS_SUPPORT_NEON) ? true : false; } //! @} +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + //! @endcond } diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 60003082ea..5b9a27fd8a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -56,6 +56,8 @@ namespace cv //! @cond IGNORED +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + struct v_uint8x16 { typedef uchar lane_type; @@ -1791,11 +1793,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) //! @brief Check CPU capability of SIMD operation static inline bool hasSIMD128() { - return checkHardwareSupport(CV_CPU_SSE2); + return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false; } //! @} +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + //! 
@endcond } diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index e428ecf8ad..afbc557166 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -540,7 +540,7 @@ CV_EXPORTS InstrNode* getCurrentNode(); ///// General instrumentation // General OpenCV region instrumentation macro -#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN) +#define CV_INSTRUMENT_REGION_() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN) // Custom OpenCV region instrumentation macro #define CV_INSTRUMENT_REGION_NAME(NAME) CV_INSTRUMENT_REGION_CUSTOM_META(NAME, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN) // Instrumentation for parallel_for_ or other regions which forks and gathers threads @@ -566,7 +566,7 @@ CV_EXPORTS InstrNode* getCurrentNode(); #else #define CV_INSTRUMENT_REGION_META(...) -#define CV_INSTRUMENT_REGION() +#define CV_INSTRUMENT_REGION_() #define CV_INSTRUMENT_REGION_NAME(...) #define CV_INSTRUMENT_REGION_MT_FORK() @@ -580,6 +580,12 @@ CV_EXPORTS InstrNode* getCurrentNode(); #define CV_INSTRUMENT_MARK_OPENCL(...) #endif +#ifdef __CV_AVX_GUARD +#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_() +#else +#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_() +#endif + //! 
@endcond #endif // OPENCV_CORE_PRIVATE_HPP diff --git a/modules/core/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp b/modules/core/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp new file mode 100644 index 0000000000..37d06efcd7 --- /dev/null +++ b/modules/core/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp @@ -0,0 +1,30 @@ +// Helper file to include dispatched functions declaration: +// +// Usage: +// #define CV_CPU_SIMD_FILENAME ".simd.hpp" +// #define CV_CPU_DISPATCH_MODE AVX2 +// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" +// #define CV_CPU_DISPATCH_MODE SSE2 +// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" + +#ifndef CV_DISABLE_OPTIMIZATION +#ifdef _MSC_VER +#pragma warning(disable: 4702) // unreachable code +#endif +#endif + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#endif + +#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +#undef CV_CPU_OPTIMIZATION_NAMESPACE_END + +#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) { +#define CV_CPU_OPTIMIZATION_NAMESPACE_END } + +#include CV_CPU_SIMD_FILENAME + +#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +#undef CV_CPU_OPTIMIZATION_NAMESPACE_END +#undef CV_CPU_DISPATCH_MODE diff --git a/modules/core/src/mathfuncs_core.dispatch.cpp b/modules/core/src/mathfuncs_core.dispatch.cpp new file mode 100644 index 0000000000..1a462a0635 --- /dev/null +++ b/modules/core/src/mathfuncs_core.dispatch.cpp @@ -0,0 +1,215 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "precomp.hpp" + +#include "mathfuncs_core.simd.hpp" +#include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { namespace hal { + +///////////////////////////////////// ATAN2 //////////////////////////////////// + +void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees); + + CV_CPU_DISPATCH(fastAtan32f, (Y, X, angle, len, angleInDegrees), + CV_CPU_DISPATCH_MODES_ALL); +} + +void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees); + + CV_CPU_DISPATCH(fastAtan64f, (Y, X, angle, len, angleInDegrees), + CV_CPU_DISPATCH_MODES_ALL); +} + +// deprecated +void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + CV_INSTRUMENT_REGION() + + fastAtan32f(Y, X, angle, len, angleInDegrees); +} + +void magnitude32f(const float* x, const float* y, float* mag, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); + + CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len), + CV_CPU_DISPATCH_MODES_ALL); +} + +void magnitude64f(const double* x, const double* y, double* mag, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0); + + CV_CPU_DISPATCH(magnitude64f, (x, y, mag, len), + CV_CPU_DISPATCH_MODES_ALL); +} + + +void invSqrt32f(const float* src, float* dst, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len); + 
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0); + + CV_CPU_DISPATCH(invSqrt32f, (src, dst, len), + CV_CPU_DISPATCH_MODES_ALL); +} + + +void invSqrt64f(const double* src, double* dst, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0); + + CV_CPU_DISPATCH(invSqrt64f, (src, dst, len), + CV_CPU_DISPATCH_MODES_ALL); +} + + +void sqrt32f(const float* src, float* dst, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0); + + CV_CPU_DISPATCH(sqrt32f, (src, dst, len), + CV_CPU_DISPATCH_MODES_ALL); +} + + +void sqrt64f(const double* src, double* dst, int len) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0); + + CV_CPU_DISPATCH(sqrt64f, (src, dst, len), + CV_CPU_DISPATCH_MODES_ALL); +} + +void exp32f(const float *src, float *dst, int n) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0); + + CV_CPU_DISPATCH(exp32f, (src, dst, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +void exp64f(const double *src, double *dst, int n) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0); + + CV_CPU_DISPATCH(exp64f, (src, dst, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +void log32f(const float *src, float *dst, int n) +{ + CV_INSTRUMENT_REGION() + + CALL_HAL(log32f, cv_hal_log32f, src, dst, n); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0); + + CV_CPU_DISPATCH(log32f, (src, dst, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +void log64f(const double *src, double *dst, int n) +{ + 
CV_INSTRUMENT_REGION() + + CALL_HAL(log64f, cv_hal_log64f, src, dst, n); + CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0); + + CV_CPU_DISPATCH(log64f, (src, dst, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +//============================================================================= +// for compatibility with 3.0 + +void exp(const float* src, float* dst, int n) +{ + exp32f(src, dst, n); +} + +void exp(const double* src, double* dst, int n) +{ + exp64f(src, dst, n); +} + +void log(const float* src, float* dst, int n) +{ + log32f(src, dst, n); +} + +void log(const double* src, double* dst, int n) +{ + log64f(src, dst, n); +} + +void magnitude(const float* x, const float* y, float* dst, int n) +{ + magnitude32f(x, y, dst, n); +} + +void magnitude(const double* x, const double* y, double* dst, int n) +{ + magnitude64f(x, y, dst, n); +} + +void sqrt(const float* src, float* dst, int len) +{ + sqrt32f(src, dst, len); +} + +void sqrt(const double* src, double* dst, int len) +{ + sqrt64f(src, dst, len); +} + +void invSqrt(const float* src, float* dst, int len) +{ + invSqrt32f(src, dst, len); +} + +void invSqrt(const double* src, double* dst, int len) +{ + invSqrt64f(src, dst, len); +} + +}} // namespace cv::hal:: + +float cv::fastAtan2( float y, float x ) +{ + using namespace cv::hal; + CV_CPU_CALL_BASELINE(fastAtan2, (y, x)); +} diff --git a/modules/core/src/mathfuncs_core.cpp b/modules/core/src/mathfuncs_core.simd.hpp similarity index 92% rename from modules/core/src/mathfuncs_core.cpp rename to modules/core/src/mathfuncs_core.simd.hpp index e0cc5b58dd..ad5176e192 100644 --- a/modules/core/src/mathfuncs_core.cpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -1,46 +1,29 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. 
-// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +namespace cv { namespace hal { + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +// forward declarations +void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees); +void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees); +void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees); +void magnitude32f(const float* x, const float* y, float* mag, int len); +void magnitude64f(const double* x, const double* y, double* mag, int len); +void invSqrt32f(const float* src, float* dst, int len); +void invSqrt64f(const double* src, double* dst, int len); +void sqrt32f(const float* src, float* dst, int len); +void sqrt64f(const double* src, double* dst, int len); +void exp32f(const float *src, float *dst, int n); +void exp64f(const double *src, double *dst, int n); +void log32f(const float *src, float *dst, int n); +void log64f(const double *src, double *dst, int n); +float fastAtan2(float y, float x); + + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY using namespace std; @@ -197,23 +180,17 @@ static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angl } // anonymous:: -namespace cv { namespace hal { - ///////////////////////////////////// ATAN2 //////////////////////////////////// void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION() - - CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees); atanImpl(Y, X, angle, len, angleInDegrees); } void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) { CV_INSTRUMENT_REGION() - - CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees); atanImpl(Y, X, angle, len, angleInDegrees); } @@ -221,7 +198,6 @@ void 
fastAtan64f(const double *Y, const double *X, double *angle, int len, bool void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION() - fastAtan32f(Y, X, angle, len, angleInDegrees); } @@ -229,9 +205,6 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); - int i = 0; #if CV_SIMD128 @@ -257,9 +230,6 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0); - int i = 0; #if CV_SIMD128_64F @@ -286,9 +256,6 @@ void invSqrt32f(const float* src, float* dst, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0); - int i = 0; #if CV_SIMD128 @@ -310,9 +277,6 @@ void invSqrt64f(const double* src, double* dst, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0); - int i = 0; #if CV_SSE2 @@ -330,9 +294,6 @@ void sqrt32f(const float* src, float* dst, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0); - int i = 0; #if CV_SIMD128 @@ -354,9 +315,6 @@ void sqrt64f(const double* src, double* dst, int len) { CV_INSTRUMENT_REGION() - CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0); - int i = 0; #if CV_SIMD128_64F @@ -381,9 +339,6 @@ void exp32f(const float *src, float *dst, int n) { CV_INSTRUMENT_REGION() - CALL_HAL(exp32f, 
cv_hal_exp32f, src, dst, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0); - for (int i = 0; i < n; i++) { dst[i] = std::exp(src[i]); @@ -394,9 +349,6 @@ void exp64f(const double *src, double *dst, int n) { CV_INSTRUMENT_REGION() - CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0); - for (int i = 0; i < n; i++) { dst[i] = std::exp(src[i]); @@ -407,9 +359,6 @@ void log32f(const float *src, float *dst, int n) { CV_INSTRUMENT_REGION() - CALL_HAL(log32f, cv_hal_log32f, src, dst, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0); - for (int i = 0; i < n; i++) { dst[i] = std::log(src[i]); @@ -419,9 +368,6 @@ void log64f(const double *src, double *dst, int n) { CV_INSTRUMENT_REGION() - CALL_HAL(log64f, cv_hal_log64f, src, dst, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0); - for (int i = 0; i < n; i++) { dst[i] = std::log(src[i]); @@ -534,9 +480,6 @@ void exp32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() - CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, _x, y, n) >= 0); - static const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), @@ -551,7 +494,90 @@ void exp32f( const float *_x, float *y, int n ) const Cv32suf* x = (const Cv32suf*)_x; Cv32suf buf[4]; -#if CV_SSE2 +#if CV_AVX2 + if( n >= 8 ) + { + static const __m256d prescale4 = _mm256_set1_pd(exp_prescale); + static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale); + static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); + static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); + + static const __m256 mA1 = _mm256_set1_ps(A1); + static const __m256 mA2 = _mm256_set1_ps(A2); + static const __m256 mA3 = _mm256_set1_ps(A3); + 
static const __m256 mA4 = _mm256_set1_ps(A4); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + ushort CV_DECL_ALIGNED(32) tab_idx[16]; + + for( ; i <= n - 8; i += 8 ) + { + __m128i xi0, xi1; + + __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4)); + __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4)); + + xd0 = _mm256_mul_pd(xd0, prescale4); + xd1 = _mm256_mul_pd(xd1, prescale4); + + xi0 = _mm256_cvtpd_epi32(xd0); + xi1 = _mm256_cvtpd_epi32(xd1); + + xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0)); + xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1)); + + // gcc does not support _mm256_set_m128 + //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0)); + __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1); + + xf = _mm256_mul_ps(xf, postscale8); + + xi0 = _mm_packs_epi32(xi0, xi1); + + _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); + + xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); + xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); + xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + + __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]); + __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]); + + // gcc does not support _mm256_set_m128 + //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0)); + __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1); + + //_mm256_set_m128i(xi1, xi0) + __m256i temp = (__m256i)_mm256_insertf128_ps(_mm256_castps128_ps256((__m128)xi0), (__m128)xi1, 1); + + yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23))); + + __m256 
zf = _mm256_add_ps(xf, mA1); + +#if CV_FMA3 + zf = _mm256_fmadd_ps(zf, xf, mA2); + zf = _mm256_fmadd_ps(zf, xf, mA3); + zf = _mm256_fmadd_ps(zf, xf, mA4); +#else + zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2); + zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3); + zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4); +#endif + zf = _mm256_mul_ps(zf, yf); + + if( y_aligned ) + { + _mm256_store_ps(y + i, zf); + } + else + { + _mm256_storeu_ps(y + i, zf); + } + } + } +#elif CV_SSE2 if( n >= 8 ) { static const __m128d prescale2 = _mm_set1_pd(exp_prescale); @@ -738,9 +764,6 @@ void exp64f( const double *_x, double *y, int n ) { CV_INSTRUMENT_REGION() - CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, _x, y, n) >= 0); - static const double A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, @@ -1187,9 +1210,6 @@ void log32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() - CALL_HAL(log32f, cv_hal_log32f, _x, y, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, _x, y, n) >= 0); - static const float shift[] = { 0, -1.f/512 }; static const float A0 = 0.3333333333333333333333333f, @@ -1336,9 +1356,6 @@ void log64f( const double *x, double *y, int n ) { CV_INSTRUMENT_REGION() - CALL_HAL(log64f, cv_hal_log64f, x, y, n); - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, x, y, n) >= 0); - static const double shift[] = { 0, -1./512 }; static const double A7 = 1.0, @@ -1524,64 +1541,13 @@ void log64f( const double *x, double *y, int n ) #endif // issue 7795 -//============================================================================= -// for compatibility with 3.0 - -void exp(const float* src, float* dst, int n) -{ - exp32f(src, dst, n); -} - -void exp(const double* src, double* dst, int n) -{ - exp64f(src, dst, n); -} - -void log(const float* src, float* dst, int n) +float fastAtan2( float y, float x ) { - log32f(src, dst, n); -} - -void 
log(const double* src, double* dst, int n) -{ - log64f(src, dst, n); -} - -void magnitude(const float* x, const float* y, float* dst, int n) -{ - magnitude32f(x, y, dst, n); -} - -void magnitude(const double* x, const double* y, double* dst, int n) -{ - magnitude64f(x, y, dst, n); -} - -void sqrt(const float* src, float* dst, int len) -{ - sqrt32f(src, dst, len); -} - -void sqrt(const double* src, double* dst, int len) -{ - sqrt64f(src, dst, len); -} - -void invSqrt(const float* src, float* dst, int len) -{ - invSqrt32f(src, dst, len); -} - -void invSqrt(const double* src, double* dst, int len) -{ - invSqrt64f(src, dst, len); + return atanImpl(y, x); } +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -} // cv::hal:: -} // cv:: +CV_CPU_OPTIMIZATION_NAMESPACE_END -float cv::fastAtan2( float y, float x ) -{ - return atanImpl(y, x); -} +}} // namespace cv::hal diff --git a/modules/world/CMakeLists.txt b/modules/world/CMakeLists.txt index dde793fa00..1152cd3ef1 100644 --- a/modules/world/CMakeLists.txt +++ b/modules/world/CMakeLists.txt @@ -24,6 +24,7 @@ if(NOT OPENCV_INITIAL_PASS) message(STATUS "Processing WORLD modules...") foreach(m ${OPENCV_MODULES_BUILD}) + set(the_module ${m}) if(OPENCV_MODULE_${m}_IS_PART_OF_WORLD) message(STATUS " module ${m}...") set(CMAKE_CURRENT_SOURCE_DIR "${OPENCV_MODULE_${m}_LOCATION}")