Merge pull request #8441 from alalek:dispatch_mathfuncs_core

pull/8523/head
Alexander Alekhin 8 years ago
commit 297ba85323
  1. 103
      cmake/OpenCVCompilerOptimizations.cmake
  2. 13
      cmake/OpenCVModule.cmake
  3. 10
      cmake/OpenCVPCHSupport.cmake
  4. 3
      modules/core/CMakeLists.txt
  5. 27
      modules/core/include/opencv2/core/cv_cpu_dispatch.h
  6. 87
      modules/core/include/opencv2/core/cv_cpu_helper.h
  7. 11
      modules/core/include/opencv2/core/cvdef.h
  8. 30
      modules/core/include/opencv2/core/hal/intrin.hpp
  9. 8
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  10. 6
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  11. 6
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  12. 10
      modules/core/include/opencv2/core/private.hpp
  13. 30
      modules/core/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp
  14. 215
      modules/core/src/mathfuncs_core.dispatch.cpp
  15. 264
      modules/core/src/mathfuncs_core.simd.hpp
  16. 1
      modules/world/CMakeLists.txt

@ -275,6 +275,11 @@ set(CPU_BASELINE_FLAGS "")
set(CPU_BASELINE_FINAL "")
set(CPU_DISPATCH_FINAL "")
# When CV_DISABLE_OPTIMIZATION is set, clear both dispatch request lists
# (CPU_DISPATCH and CPU_DISPATCH_REQUIRE) by setting them to empty strings.
if(CV_DISABLE_OPTIMIZATION)
set(CPU_DISPATCH "")
set(CPU_DISPATCH_REQUIRE "")
endif()
macro(ocv_check_compiler_optimization OPT)
if(NOT DEFINED CPU_${OPT}_SUPPORTED)
if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE)
@ -319,7 +324,7 @@ macro(ocv_check_compiler_optimization OPT)
endmacro()
foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "" FORCE)
set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "")
if(NOT DEFINED CPU_${OPT}_FORCE)
set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}")
endif()
@ -515,15 +520,27 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
endforeach()
foreach(fname ${${SOURCES_VAR_NAME}})
string(TOLOWER "${fname}" fname_LOWER)
if(fname_LOWER MATCHES "[.]opt_.*[.]cpp$")
if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
message(STATUS "Excluding from source files list: ${fname}")
if(fname_LOWER MATCHES "\\.(.*)\\.cpp$")
string(TOUPPER "${CMAKE_MATCH_1}" OPT_)
if(OPT_ MATCHES "(CUDA.*|DISPATCH.*|OCL)") # don't touch files like filename.cuda.cpp
list(APPEND __result "${fname}")
#continue()
elseif(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
message(STATUS "Excluding from source files list (optimization is disabled): ${fname}")
#continue()
else()
get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS)
if(__definitions)
list(APPEND __definitions "CV_CPU_DISPATCH_MODE=${OPT_}")
else()
set(__definitions "CV_CPU_DISPATCH_MODE=${OPT_}")
endif()
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}")
set(__opt_found 0)
foreach(OPT ${CPU_BASELINE_FINAL})
string(TOLOWER "${OPT}" OPT_LOWER)
if(fname_LOWER MATCHES "_${OPT_LOWER}[.]cpp$")
if(fname_LOWER MATCHES "\\.${OPT_LOWER}\\.cpp$")
#message("${fname} BASELINE-${OPT}")
set(__opt_found 1)
list(APPEND __result "${fname}")
@ -533,11 +550,11 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
foreach(OPT ${CPU_DISPATCH_FINAL})
foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED})
string(TOLOWER "${OPT2}" OPT2_LOWER)
if(fname_LOWER MATCHES "_${OPT2_LOWER}[.]cpp$")
if(fname_LOWER MATCHES "\\.${OPT2_LOWER}\\.cpp$")
list(APPEND __result_${OPT} "${fname}")
math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1")
set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE)
#message("${fname} ${OPT}")
#message("(${CPU_${OPT}_USAGE_COUNT})${fname} ${OPT}")
#message(" ${CPU_DISPATCH_${OPT}_INCLUDED}")
#message(" ${CPU_DISPATCH_DEFINITIONS_${OPT}}")
#message(" ${CPU_DISPATCH_FLAGS_${OPT}}")
@ -573,7 +590,13 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
list(APPEND __result "$<TARGET_OBJECTS:${TARGET_BASE_NAME}_${OPT}>")
else()
foreach(fname ${__result_${OPT}})
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS)
if(__definitions)
list(APPEND __definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
else()
set(__definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
endif()
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}")
set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
endforeach()
list(APPEND __result ${__result_${OPT}})
@ -620,18 +643,25 @@ macro(ocv_compiler_optimization_fill_cpu_config)
set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
# define CV_CPU_HAS_SUPPORT_${OPT} 1
# define CV_CPU_CALL_${OPT}(...) return __VA_ARGS__
# define CV_CPU_CALL_${OPT}(fn, args) return (opt_${OPT}::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
# define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
# define CV_CPU_CALL_${OPT}(...) if (CV_CPU_HAS_SUPPORT_${OPT}) return __VA_ARGS__
# define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args)
#else
# define CV_CPU_HAS_SUPPORT_${OPT} 0
# define CV_CPU_CALL_${OPT}(...)
# define CV_CPU_CALL_${OPT}(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_${OPT}(fn, args, mode, ...) CV_CPU_CALL_${OPT}(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
")
endif()
endforeach()
set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
")
set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")
if(EXISTS "${__file}")
file(READ "${__file}" __content)
@ -644,6 +674,57 @@ macro(ocv_compiler_optimization_fill_cpu_config)
endif()
endmacro()
# ocv_add_dispatched_file(<filename> [<OPT>...])
#
# Generates, in CMAKE_CURRENT_BINARY_DIR, the helper files for a source whose
# implementation lives in "<filename>.simd.hpp":
#
#   * "<filename>.<opt>.cpp" for every requested optimization <OPT> (lower-cased
#     in the file name). Each file only includes "precomp.hpp" and
#     "<filename>.simd.hpp". Every generated path is appended to
#     OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED.
#   * "<filename>.simd_declarations.hpp", which defines CV_CPU_SIMD_FILENAME,
#     includes cv_cpu_include_simd_declarations.hpp once per optimization (with
#     CV_CPU_DISPATCH_MODE set to that optimization) and finally defines
#     CV_CPU_DISPATCH_MODES_ALL.
#
# Arguments:
#   filename - base name of the dispatched source (without ".simd.hpp").
#   ARGN     - optimization names, e.g. SSE2 AVX AVX2. The list is ignored
#              (treated as empty) when CV_DISABLE_OPTIMIZATION is set or
#              CV_ENABLE_INTRINSICS is not set.
#
# Nothing is done while OPENCV_INITIAL_PASS is true.
#
# A file is rewritten only when its content differs from what would be
# generated, so an unchanged file keeps its timestamp.
#
# Example:
#   ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
macro(ocv_add_dispatched_file filename)
  if(NOT OPENCV_INITIAL_PASS)
    # Content of every generated "<filename>.<opt>.cpp"; identical for all OPTs.
    set(__codestr "
#include \"precomp.hpp\"
#include \"${filename}.simd.hpp\"
")

    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${filename}.simd.hpp\"")
    set(__dispatch_modes "BASELINE")

    set(__optimizations "${ARGN}")
    if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
      set(__optimizations "")
    endif()

    foreach(OPT ${__optimizations})
      string(TOLOWER "${OPT}" OPT_LOWER)
      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${OPT_LOWER}.cpp")
      # This is a macro, so __content is shared with the caller and survives
      # between iterations. It must be reset when the file is missing; otherwise
      # the content read for a previous OPT (equal to __codestr) would make a
      # missing file look up to date and it would never be created.
      if(EXISTS "${__file}")
        file(READ "${__file}" __content)
      else()
        set(__content "")
      endif()
      if(NOT __content STREQUAL __codestr)
        file(WRITE "${__file}" "${__codestr}")
      endif()
      list(APPEND OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED "${__file}")

      set(__declarations_str "${__declarations_str}
#define CV_CPU_DISPATCH_MODE ${OPT}
#include \"opencv2/core/private/cv_cpu_include_simd_declarations.hpp\"
")
      # Prepend: the optimization listed last ends up first, BASELINE stays last.
      set(__dispatch_modes "${OPT}, ${__dispatch_modes}")
    endforeach()

    set(__declarations_str "${__declarations_str}
#define CV_CPU_DISPATCH_MODES_ALL ${__dispatch_modes}
")

    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.simd_declarations.hpp")
    # Same stale-value hazard as above: reset __content when there is no file.
    if(EXISTS "${__file}")
      file(READ "${__file}" __content)
    else()
      set(__content "")
    endif()
    if(NOT __content STREQUAL __declarations_str)
      file(WRITE "${__file}" "${__declarations_str}")
    endif()
  endif()
endmacro()
if(CV_DISABLE_OPTIMIZATION OR CV_ICC)
ocv_update(CV_ENABLE_UNROLLED 0)
else()

@ -314,6 +314,7 @@ macro(ocv_glob_modules)
set(OPENCV_INITIAL_PASS OFF)
if(${BUILD_opencv_world})
foreach(m ${OPENCV_MODULES_BUILD})
set(the_module "${m}")
if("${m}" STREQUAL opencv_world)
add_subdirectory("${OPENCV_MODULE_opencv_world_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/world")
elseif(NOT OPENCV_MODULE_${m}_IS_PART_OF_WORLD AND NOT ${m} STREQUAL opencv_world)
@ -329,6 +330,7 @@ macro(ocv_glob_modules)
endforeach()
else()
foreach(m ${OPENCV_MODULES_BUILD})
set(the_module "${m}")
if(m MATCHES "^opencv_")
string(REGEX REPLACE "^opencv_" "" __shortname "${m}")
add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${__shortname}")
@ -646,11 +648,13 @@ macro(ocv_set_module_sources)
ocv_get_module_external_sources()
endif()
if(OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED)
list(APPEND OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED})
endif()
# use full paths for module to be independent from the module location
ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS)
ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
endmacro()
@ -766,6 +770,11 @@ macro(ocv_create_module)
endmacro()
macro(_ocv_create_module)
ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
# The condition we ought to be testing here is whether ocv_add_precompiled_headers will
# be called at some point in the future. We can't look into the future, though,
# so this will have to do.

@ -288,11 +288,12 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input)
foreach(src ${_sources})
if(NOT "${src}" MATCHES "\\.mm$")
get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
if(NOT oldProps)
get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS)
if(NOT oldProps AND NOT oldProps2)
set(newProperties "-include \"${CMAKE_CURRENT_BINARY_DIR}/${_name}\"")
set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}")
else()
ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}")
ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}")
endif()
endif()
endforeach()
@ -339,11 +340,12 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)
AND NOT "${src}" MATCHES "^\$" # CMake generator expressions
)
get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
if(NOT oldProps)
get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS)
if(NOT oldProps AND NOT oldProps2)
set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"")
set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}")
else()
ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}")
ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}")
endif()
endif()
endforeach()

@ -1,4 +1,7 @@
set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_module(core
"${OPENCV_HAL_LINKER_LIBS}"
OPTIONAL opencv_cudev

@ -7,6 +7,23 @@
#include "cv_cpu_config.h"
#include "cv_cpu_helper.h"
// Namespace used for the implementation compiled in the current translation unit:
// opt_<MODE> when CV_CPU_DISPATCH_MODE is defined (e.g. opt_AVX2), cpu_baseline otherwise.
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_NAMESPACE __CV_CAT(opt_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#endif
// CV_CPU_DISPATCH(fn, args, mode1, mode2, ...):
// appends the END marker to the mode list and expands __CV_CPU_DISPATCH_CHAIN_<mode1>.
// The per-mode chain macros (__CV_CPU_DISPATCH_CHAIN_SSE2, ..._BASELINE, defined in
// cv_cpu_helper.h) emit CV_CPU_CALL_<mode>(fn, args) and then continue with the next
// mode in the list. The chain macro for END expands to nothing.
// `args` is the parenthesized argument list, e.g. (src, dst, len).
#define __CV_CPU_DISPATCH_CHAIN_END(fn, args, mode, ...) /* done */
#define __CV_CPU_DISPATCH(fn, args, mode, ...) __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define __CV_CPU_DISPATCH_EXPAND(fn, args, ...) __CV_EXPAND(__CV_CPU_DISPATCH(fn, args, __VA_ARGS__))
#define CV_CPU_DISPATCH(fn, args, ...) __CV_CPU_DISPATCH_EXPAND(fn, args, __VA_ARGS__, END) // expand macros
#if defined CV_ENABLE_INTRINSICS \
&& !defined CV_DISABLE_OPTIMIZATION \
&& !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
@ -76,6 +93,16 @@
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
// Only available when AVX is compiled for this translation unit
// (CV_CPU_COMPILE_AVX) but is not part of the baseline (CV_CPU_BASELINE_COMPILE_AVX).
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
// Helper object whose destructor executes _mm256_zeroupper().
// NOTE(review): presumably used to avoid AVX/SSE transition penalties when
// leaving AVX code - confirm against the intrinsic's documentation.
struct VZeroUpperGuard {
#ifdef __GNUC__
__attribute__((always_inline))
#endif
inline ~VZeroUpperGuard() { _mm256_zeroupper(); }
};
// Declares a local VZeroUpperGuard named __vzeroupper_guard; its destructor runs
// when the enclosing scope is left. The expansion already ends with ';'.
#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard;
#endif
#endif // __OPENCV_BUILD

@ -2,132 +2,147 @@
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
# define CV_CPU_HAS_SUPPORT_SSE 1
# define CV_CPU_CALL_SSE(...) return __VA_ARGS__
# define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
# define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
# define CV_CPU_CALL_SSE(...) if (CV_CPU_HAS_SUPPORT_SSE) return __VA_ARGS__
# define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSE 0
# define CV_CPU_CALL_SSE(...)
# define CV_CPU_CALL_SSE(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
# define CV_CPU_HAS_SUPPORT_SSE2 1
# define CV_CPU_CALL_SSE2(...) return __VA_ARGS__
# define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
# define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
# define CV_CPU_CALL_SSE2(...) if (CV_CPU_HAS_SUPPORT_SSE2) return __VA_ARGS__
# define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSE2 0
# define CV_CPU_CALL_SSE2(...)
# define CV_CPU_CALL_SSE2(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
# define CV_CPU_HAS_SUPPORT_SSE3 1
# define CV_CPU_CALL_SSE3(...) return __VA_ARGS__
# define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
# define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
# define CV_CPU_CALL_SSE3(...) if (CV_CPU_HAS_SUPPORT_SSE3) return __VA_ARGS__
# define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSE3 0
# define CV_CPU_CALL_SSE3(...)
# define CV_CPU_CALL_SSE3(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
# define CV_CPU_HAS_SUPPORT_SSSE3 1
# define CV_CPU_CALL_SSSE3(...) return __VA_ARGS__
# define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
# define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
# define CV_CPU_CALL_SSSE3(...) if (CV_CPU_HAS_SUPPORT_SSSE3) return __VA_ARGS__
# define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSSE3 0
# define CV_CPU_CALL_SSSE3(...)
# define CV_CPU_CALL_SSSE3(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
# define CV_CPU_HAS_SUPPORT_SSE4_1 1
# define CV_CPU_CALL_SSE4_1(...) return __VA_ARGS__
# define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
# define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
# define CV_CPU_CALL_SSE4_1(...) if (CV_CPU_HAS_SUPPORT_SSE4_1) return __VA_ARGS__
# define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSE4_1 0
# define CV_CPU_CALL_SSE4_1(...)
# define CV_CPU_CALL_SSE4_1(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
# define CV_CPU_HAS_SUPPORT_SSE4_2 1
# define CV_CPU_CALL_SSE4_2(...) return __VA_ARGS__
# define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
# define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
# define CV_CPU_CALL_SSE4_2(...) if (CV_CPU_HAS_SUPPORT_SSE4_2) return __VA_ARGS__
# define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
#else
# define CV_CPU_HAS_SUPPORT_SSE4_2 0
# define CV_CPU_CALL_SSE4_2(...)
# define CV_CPU_CALL_SSE4_2(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
# define CV_CPU_HAS_SUPPORT_POPCNT 1
# define CV_CPU_CALL_POPCNT(...) return __VA_ARGS__
# define CV_CPU_CALL_POPCNT(fn, args) return (opt_POPCNT::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
# define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
# define CV_CPU_CALL_POPCNT(...) if (CV_CPU_HAS_SUPPORT_POPCNT) return __VA_ARGS__
# define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
#else
# define CV_CPU_HAS_SUPPORT_POPCNT 0
# define CV_CPU_CALL_POPCNT(...)
# define CV_CPU_CALL_POPCNT(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
# define CV_CPU_HAS_SUPPORT_AVX 1
# define CV_CPU_CALL_AVX(...) return __VA_ARGS__
# define CV_CPU_CALL_AVX(fn, args) return (opt_AVX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
# define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
# define CV_CPU_CALL_AVX(...) if (CV_CPU_HAS_SUPPORT_AVX) return __VA_ARGS__
# define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
#else
# define CV_CPU_HAS_SUPPORT_AVX 0
# define CV_CPU_CALL_AVX(...)
# define CV_CPU_CALL_AVX(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
# define CV_CPU_HAS_SUPPORT_FP16 1
# define CV_CPU_CALL_FP16(...) return __VA_ARGS__
# define CV_CPU_CALL_FP16(fn, args) return (opt_FP16::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
# define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
# define CV_CPU_CALL_FP16(...) if (CV_CPU_HAS_SUPPORT_FP16) return __VA_ARGS__
# define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
#else
# define CV_CPU_HAS_SUPPORT_FP16 0
# define CV_CPU_CALL_FP16(...)
# define CV_CPU_CALL_FP16(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
# define CV_CPU_HAS_SUPPORT_AVX2 1
# define CV_CPU_CALL_AVX2(...) return __VA_ARGS__
# define CV_CPU_CALL_AVX2(fn, args) return (opt_AVX2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
# define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
# define CV_CPU_CALL_AVX2(...) if (CV_CPU_HAS_SUPPORT_AVX2) return __VA_ARGS__
# define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
#else
# define CV_CPU_HAS_SUPPORT_AVX2 0
# define CV_CPU_CALL_AVX2(...)
# define CV_CPU_CALL_AVX2(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
# define CV_CPU_HAS_SUPPORT_FMA3 1
# define CV_CPU_CALL_FMA3(...) return __VA_ARGS__
# define CV_CPU_CALL_FMA3(fn, args) return (opt_FMA3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
# define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
# define CV_CPU_CALL_FMA3(...) if (CV_CPU_HAS_SUPPORT_FMA3) return __VA_ARGS__
# define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
#else
# define CV_CPU_HAS_SUPPORT_FMA3 0
# define CV_CPU_CALL_FMA3(...)
# define CV_CPU_CALL_FMA3(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_CPU_HAS_SUPPORT_NEON 1
# define CV_CPU_CALL_NEON(...) return __VA_ARGS__
# define CV_CPU_CALL_NEON(fn, args) return (opt_NEON::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
# define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
# define CV_CPU_CALL_NEON(...) if (CV_CPU_HAS_SUPPORT_NEON) return __VA_ARGS__
# define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
#else
# define CV_CPU_HAS_SUPPORT_NEON 0
# define CV_CPU_CALL_NEON(...)
# define CV_CPU_CALL_NEON(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */

@ -52,6 +52,17 @@
#include "cvconfig.h"
#endif
// __CV_EXPAND(x): identity macro; wrapping an expression in it forces one more
// macro expansion pass over x (used around __VA_ARGS__-based macros).
#ifndef __CV_EXPAND
#define __CV_EXPAND(x) x
#endif
// __CV_CAT(x, y): token concatenation. The extra indirection levels let the
// arguments be macro-expanded before '##' is applied, so that e.g.
// __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) pastes the value of the macro, not its name.
#ifndef __CV_CAT
#define __CV_CAT__(x, y) x ## y
#define __CV_CAT_(x, y) __CV_CAT__(x, y)
#define __CV_CAT(x, y) __CV_CAT_(x, y)
#endif
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif

@ -60,6 +60,25 @@
// access from within opencv code more accessible
namespace cv {
#ifndef CV_DOXYGEN
// Name of the nested namespace that holds the intrinsics for the current
// translation unit: hal_<MODE> when CV_CPU_DISPATCH_MODE is defined,
// hal_baseline otherwise.
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
// Open and immediately close the namespace so that it is declared before the
// using-directive below refers to it.
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
// Make the names of the selected namespace visible in the enclosing namespace.
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// Re-open the namespace for the declarations that follow; it is closed by a
// matching CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END further down in this file.
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
//! @addtogroup core_hal_intrin
//! @{
@ -281,6 +300,9 @@ template <typename T> struct V_SIMD128Traits
//! @}
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
}
#ifdef CV_DOXYGEN
@ -323,6 +345,10 @@ template <typename T> struct V_SIMD128Traits
namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
template <typename R> struct V_RegTrait128;
template <> struct V_RegTrait128<uchar> {
@ -407,6 +433,10 @@ template <> struct V_RegTrait128<double> {
};
#endif
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} // cv::
//! @endcond

@ -53,6 +53,10 @@
namespace cv
{
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
/** @addtogroup core_hal_intrin
"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
@ -1827,7 +1831,9 @@ static inline bool hasSIMD128()
//! @}
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
}
#endif

@ -53,6 +53,8 @@ namespace cv
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
@ -1238,11 +1240,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
return checkHardwareSupport(CV_CPU_NEON);
return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
}

@ -56,6 +56,8 @@ namespace cv
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16
{
typedef uchar lane_type;
@ -1791,11 +1793,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
return checkHardwareSupport(CV_CPU_SSE2);
return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
}

@ -540,7 +540,7 @@ CV_EXPORTS InstrNode* getCurrentNode();
///// General instrumentation
// General OpenCV region instrumentation macro
#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN)
#define CV_INSTRUMENT_REGION_() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN)
// Custom OpenCV region instrumentation macro
#define CV_INSTRUMENT_REGION_NAME(NAME) CV_INSTRUMENT_REGION_CUSTOM_META(NAME, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN)
// Instrumentation for parallel_for_ or other regions which forks and gathers threads
@ -566,7 +566,7 @@ CV_EXPORTS InstrNode* getCurrentNode();
#else
#define CV_INSTRUMENT_REGION_META(...)
#define CV_INSTRUMENT_REGION()
#define CV_INSTRUMENT_REGION_()
#define CV_INSTRUMENT_REGION_NAME(...)
#define CV_INSTRUMENT_REGION_MT_FORK()
@ -580,6 +580,12 @@ CV_EXPORTS InstrNode* getCurrentNode();
#define CV_INSTRUMENT_MARK_OPENCL(...)
#endif
// CV_INSTRUMENT_REGION() wraps CV_INSTRUMENT_REGION_(): when __CV_AVX_GUARD is
// defined, the guard declaration is emitted first, followed by the regular
// instrumentation macro; otherwise it is a plain alias of CV_INSTRUMENT_REGION_().
#ifdef __CV_AVX_GUARD
#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_()
#else
#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_()
#endif
//! @endcond
#endif // OPENCV_CORE_PRIVATE_HPP

@ -0,0 +1,30 @@
// Helper file to include dispatched functions declaration:
//
// Usage:
// #define CV_CPU_SIMD_FILENAME "<filename>.simd.hpp"
// #define CV_CPU_DISPATCH_MODE AVX2
// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
// #define CV_CPU_DISPATCH_MODE SSE2
// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
//
// Notes:
// - This file intentionally has no include guard: it is included once per dispatch mode.
// - CV_CPU_DISPATCH_MODE is #undef'ed at the end, so it must be defined again
//   before every include. CV_CPU_SIMD_FILENAME is left untouched.
// - CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN/END are left undefined after this file.
#ifndef CV_DISABLE_OPTIMIZATION
#ifdef _MSC_VER
#pragma warning(disable: 4702) // unreachable code
#endif
#endif
// Stays defined after this file is processed.
// NOTE(review): presumably checked by the *.simd.hpp file to emit declarations
// without definitions - confirm against the included file.
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#endif
// Redirect the optimization namespace to opt_<CV_CPU_DISPATCH_MODE> for the
// duration of the include below.
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#include CV_CPU_SIMD_FILENAME
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
#undef CV_CPU_DISPATCH_MODE

@ -0,0 +1,215 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "mathfuncs_core.simd.hpp"
#include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace hal {
///////////////////////////////////// ATAN2 ////////////////////////////////////
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION()
CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);
CV_CPU_DISPATCH(fastAtan32f, (Y, X, angle, len, angleInDegrees),
CV_CPU_DISPATCH_MODES_ALL);
}
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
{
CV_INSTRUMENT_REGION()
CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);
CV_CPU_DISPATCH(fastAtan64f, (Y, X, angle, len, angleInDegrees),
CV_CPU_DISPATCH_MODES_ALL);
}
// deprecated
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION()
fastAtan32f(Y, X, angle, len, angleInDegrees);
}
void magnitude32f(const float* x, const float* y, float* mag, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0);
CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void magnitude64f(const double* x, const double* y, double* mag, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0);
CV_CPU_DISPATCH(magnitude64f, (x, y, mag, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void invSqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0);
CV_CPU_DISPATCH(invSqrt32f, (src, dst, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void invSqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0);
CV_CPU_DISPATCH(invSqrt64f, (src, dst, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void sqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0);
CV_CPU_DISPATCH(sqrt32f, (src, dst, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void sqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0);
CV_CPU_DISPATCH(sqrt64f, (src, dst, len),
CV_CPU_DISPATCH_MODES_ALL);
}
void exp32f(const float *src, float *dst, int n)
{
CV_INSTRUMENT_REGION()
CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0);
CV_CPU_DISPATCH(exp32f, (src, dst, n),
CV_CPU_DISPATCH_MODES_ALL);
}
// Public entry point for the double-precision exponential kernel.
// Parameters: src - input array; dst - output array; n - element count.
// Backends are attempted in source order: HAL hook cv_hal_exp64f, IPP
// ippsExp_64f_A50, then the CPU-dispatched exp64f implementation.
// NOTE(review): CALL_HAL / CV_IPP_RUN_FAST are defined elsewhere; presumably
// each returns early when its backend succeeds - confirm.
void exp64f(const double *src, double *dst, int n)
{
CV_INSTRUMENT_REGION()
// 1) Optional HAL replacement.
CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n);
// 2) IPP path; a non-negative IPP status is treated as success.
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0);
// 3) Built-in implementation, selected via CV_CPU_DISPATCH.
CV_CPU_DISPATCH(exp64f, (src, dst, n),
CV_CPU_DISPATCH_MODES_ALL);
}
// Public entry point for the single-precision logarithm kernel.
// Parameters: src - input array; dst - output array; n - element count.
// Backends are attempted in source order: HAL hook cv_hal_log32f, IPP
// ippsLn_32f_A21, then the CPU-dispatched log32f implementation.
// NOTE(review): CALL_HAL / CV_IPP_RUN_FAST are defined elsewhere; presumably
// each returns early when its backend succeeds - confirm.
void log32f(const float *src, float *dst, int n)
{
CV_INSTRUMENT_REGION()
// 1) Optional HAL replacement.
CALL_HAL(log32f, cv_hal_log32f, src, dst, n);
// 2) IPP path; a non-negative IPP status is treated as success.
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0);
// 3) Built-in implementation, selected via CV_CPU_DISPATCH.
CV_CPU_DISPATCH(log32f, (src, dst, n),
CV_CPU_DISPATCH_MODES_ALL);
}
// Public entry point for the double-precision logarithm kernel.
// Parameters: src - input array; dst - output array; n - element count.
// Backends are attempted in source order: HAL hook cv_hal_log64f, IPP
// ippsLn_64f_A50, then the CPU-dispatched log64f implementation.
// NOTE(review): CALL_HAL / CV_IPP_RUN_FAST are defined elsewhere; presumably
// each returns early when its backend succeeds - confirm.
void log64f(const double *src, double *dst, int n)
{
CV_INSTRUMENT_REGION()
// 1) Optional HAL replacement.
CALL_HAL(log64f, cv_hal_log64f, src, dst, n);
// 2) IPP path; a non-negative IPP status is treated as success.
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0);
// 3) Built-in implementation, selected via CV_CPU_DISPATCH.
CV_CPU_DISPATCH(log64f, (src, dst, n),
CV_CPU_DISPATCH_MODES_ALL);
}
//=============================================================================
// for compatibility with 3.0
// 3.0-compatibility overload: forwards the float arrays unchanged to exp32f.
void exp(const float* src, float* dst, int n)
{
exp32f(src, dst, n);
}
// 3.0-compatibility overload: forwards the double arrays unchanged to exp64f.
void exp(const double* src, double* dst, int n)
{
exp64f(src, dst, n);
}
// 3.0-compatibility overload: forwards the float arrays unchanged to log32f.
void log(const float* src, float* dst, int n)
{
log32f(src, dst, n);
}
// 3.0-compatibility overload: forwards the double arrays unchanged to log64f.
void log(const double* src, double* dst, int n)
{
log64f(src, dst, n);
}
// 3.0-compatibility overload: forwards to magnitude32f.
// Note the parameter naming differs from the target: dst/n here map to
// mag/len there.
void magnitude(const float* x, const float* y, float* dst, int n)
{
magnitude32f(x, y, dst, n);
}
// 3.0-compatibility overload: forwards to magnitude64f.
// Note the parameter naming differs from the target: dst/n here map to
// mag/len there.
void magnitude(const double* x, const double* y, double* dst, int n)
{
magnitude64f(x, y, dst, n);
}
// 3.0-compatibility overload: forwards the float arrays unchanged to sqrt32f.
void sqrt(const float* src, float* dst, int len)
{
sqrt32f(src, dst, len);
}
// 3.0-compatibility overload: forwards the double arrays unchanged to sqrt64f.
void sqrt(const double* src, double* dst, int len)
{
sqrt64f(src, dst, len);
}
// 3.0-compatibility overload: forwards the float arrays unchanged to invSqrt32f.
void invSqrt(const float* src, float* dst, int len)
{
invSqrt32f(src, dst, len);
}
// 3.0-compatibility overload: forwards the double arrays unchanged to invSqrt64f.
void invSqrt(const double* src, double* dst, int len)
{
invSqrt64f(src, dst, len);
}
}} // namespace cv::hal::
// Scalar fastAtan2 exposed directly in the cv namespace.
// Parameters: y, x - the two float arguments, passed on in that order.
// NOTE(review): no explicit return statement is visible in this non-void
// function; presumably CV_CPU_CALL_BASELINE (defined elsewhere) expands to a
// `return` of the baseline fastAtan2(y, x) call - confirm against the macro.
float cv::fastAtan2( float y, float x )
{
// Make the cv::hal names visible to the code the macro expands to.
using namespace cv::hal;
CV_CPU_CALL_BASELINE(fastAtan2, (y, x));
}

@ -1,46 +1,29 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// forward declarations
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
void magnitude32f(const float* x, const float* y, float* mag, int len);
void magnitude64f(const double* x, const double* y, double* mag, int len);
void invSqrt32f(const float* src, float* dst, int len);
void invSqrt64f(const double* src, double* dst, int len);
void sqrt32f(const float* src, float* dst, int len);
void sqrt64f(const double* src, double* dst, int len);
void exp32f(const float *src, float *dst, int n);
void exp64f(const double *src, double *dst, int n);
void log32f(const float *src, float *dst, int n);
void log64f(const double *src, double *dst, int n);
float fastAtan2(float y, float x);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
using namespace std;
@ -197,23 +180,17 @@ static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angl
} // anonymous::
namespace cv { namespace hal {
///////////////////////////////////// ATAN2 ////////////////////////////////////
// Single-precision array arctangent: hands Y, X, angle, len and
// angleInDegrees to atanImpl<float> after offering the call to the HAL hook
// cv_hal_fastAtan32f.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are merged; verify which of the lines below exist in the
// current file before relying on this description.
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION()
// Optional HAL replacement; presumably returns early on success - confirm.
CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);
atanImpl<float>(Y, X, angle, len, angleInDegrees);
}
// Double-precision array arctangent: hands Y, X, angle, len and
// angleInDegrees to atanImpl<double> after offering the call to the HAL hook
// cv_hal_fastAtan64f.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are merged; verify which of the lines below exist in the
// current file before relying on this description.
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
{
CV_INSTRUMENT_REGION()
// Optional HAL replacement; presumably returns early on success - confirm.
CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);
atanImpl<double>(Y, X, angle, len, angleInDegrees);
}
@ -221,7 +198,6 @@ void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool
// Array form of fastAtan2 for float data: a thin alias that forwards every
// argument unchanged to fastAtan32f.
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION()
fastAtan32f(Y, X, angle, len, angleInDegrees);
}
@ -229,9 +205,6 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0);
int i = 0;
#if CV_SIMD128
@ -257,9 +230,6 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0);
int i = 0;
#if CV_SIMD128_64F
@ -286,9 +256,6 @@ void invSqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0);
int i = 0;
#if CV_SIMD128
@ -310,9 +277,6 @@ void invSqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0);
int i = 0;
#if CV_SSE2
@ -330,9 +294,6 @@ void sqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0);
int i = 0;
#if CV_SIMD128
@ -354,9 +315,6 @@ void sqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION()
CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0);
int i = 0;
#if CV_SIMD128_64F
@ -381,9 +339,6 @@ void exp32f(const float *src, float *dst, int n)
{
CV_INSTRUMENT_REGION()
CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0);
for (int i = 0; i < n; i++)
{
dst[i] = std::exp(src[i]);
@ -394,9 +349,6 @@ void exp64f(const double *src, double *dst, int n)
{
CV_INSTRUMENT_REGION()
CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0);
for (int i = 0; i < n; i++)
{
dst[i] = std::exp(src[i]);
@ -407,9 +359,6 @@ void log32f(const float *src, float *dst, int n)
{
CV_INSTRUMENT_REGION()
CALL_HAL(log32f, cv_hal_log32f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0);
for (int i = 0; i < n; i++)
{
dst[i] = std::log(src[i]);
@ -419,9 +368,6 @@ void log64f(const double *src, double *dst, int n)
{
CV_INSTRUMENT_REGION()
CALL_HAL(log64f, cv_hal_log64f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0);
for (int i = 0; i < n; i++)
{
dst[i] = std::log(src[i]);
@ -534,9 +480,6 @@ void exp32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION()
CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, _x, y, n) >= 0);
static const float
A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
@ -551,7 +494,90 @@ void exp32f( const float *_x, float *y, int n )
const Cv32suf* x = (const Cv32suf*)_x;
Cv32suf buf[4];
#if CV_SSE2
#if CV_AVX2
if( n >= 8 )
{
static const __m256d prescale4 = _mm256_set1_pd(exp_prescale);
static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale);
static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
static const __m256 mA1 = _mm256_set1_ps(A1);
static const __m256 mA2 = _mm256_set1_ps(A2);
static const __m256 mA3 = _mm256_set1_ps(A3);
static const __m256 mA4 = _mm256_set1_ps(A4);
bool y_aligned = (size_t)(void*)y % 32 == 0;
ushort CV_DECL_ALIGNED(32) tab_idx[16];
for( ; i <= n - 8; i += 8 )
{
__m128i xi0, xi1;
__m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
__m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4));
xd0 = _mm256_mul_pd(xd0, prescale4);
xd1 = _mm256_mul_pd(xd1, prescale4);
xi0 = _mm256_cvtpd_epi32(xd0);
xi1 = _mm256_cvtpd_epi32(xd1);
xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0));
xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1));
// gcc does not support _mm256_set_m128
//xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
__m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1);
xf = _mm256_mul_ps(xf, postscale8);
xi0 = _mm_packs_epi32(xi0, xi1);
_mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
__m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]);
__m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]);
// gcc does not support _mm256_set_m128
//__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
__m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1);
//_mm256_set_m128i(xi1, xi0)
__m256i temp = (__m256i)_mm256_insertf128_ps(_mm256_castps128_ps256((__m128)xi0), (__m128)xi1, 1);
yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));
__m256 zf = _mm256_add_ps(xf, mA1);
#if CV_FMA3
zf = _mm256_fmadd_ps(zf, xf, mA2);
zf = _mm256_fmadd_ps(zf, xf, mA3);
zf = _mm256_fmadd_ps(zf, xf, mA4);
#else
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2);
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3);
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4);
#endif
zf = _mm256_mul_ps(zf, yf);
if( y_aligned )
{
_mm256_store_ps(y + i, zf);
}
else
{
_mm256_storeu_ps(y + i, zf);
}
}
}
#elif CV_SSE2
if( n >= 8 )
{
static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
@ -738,9 +764,6 @@ void exp64f( const double *_x, double *y, int n )
{
CV_INSTRUMENT_REGION()
CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, _x, y, n) >= 0);
static const double
A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
@ -1187,9 +1210,6 @@ void log32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION()
CALL_HAL(log32f, cv_hal_log32f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, _x, y, n) >= 0);
static const float shift[] = { 0, -1.f/512 };
static const float
A0 = 0.3333333333333333333333333f,
@ -1336,9 +1356,6 @@ void log64f( const double *x, double *y, int n )
{
CV_INSTRUMENT_REGION()
CALL_HAL(log64f, cv_hal_log64f, x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, x, y, n) >= 0);
static const double shift[] = { 0, -1./512 };
static const double
A7 = 1.0,
@ -1524,64 +1541,13 @@ void log64f( const double *x, double *y, int n )
#endif // issue 7795
//=============================================================================
// for compatibility with 3.0
void exp(const float* src, float* dst, int n)
{
exp32f(src, dst, n);
}
void exp(const double* src, double* dst, int n)
{
exp64f(src, dst, n);
}
void log(const float* src, float* dst, int n)
float fastAtan2( float y, float x )
{
log32f(src, dst, n);
}
void log(const double* src, double* dst, int n)
{
log64f(src, dst, n);
}
void magnitude(const float* x, const float* y, float* dst, int n)
{
magnitude32f(x, y, dst, n);
}
void magnitude(const double* x, const double* y, double* dst, int n)
{
magnitude64f(x, y, dst, n);
}
void sqrt(const float* src, float* dst, int len)
{
sqrt32f(src, dst, len);
}
void sqrt(const double* src, double* dst, int len)
{
sqrt64f(src, dst, len);
}
void invSqrt(const float* src, float* dst, int len)
{
invSqrt32f(src, dst, len);
}
void invSqrt(const double* src, double* dst, int len)
{
invSqrt64f(src, dst, len);
return atanImpl<float>(y, x);
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
} // cv::hal::
} // cv::
CV_CPU_OPTIMIZATION_NAMESPACE_END
float cv::fastAtan2( float y, float x )
{
return atanImpl<float>(y, x);
}
}} // namespace cv::hal

@ -24,6 +24,7 @@ if(NOT OPENCV_INITIAL_PASS)
message(STATUS "Processing WORLD modules...")
foreach(m ${OPENCV_MODULES_BUILD})
set(the_module ${m})
if(OPENCV_MODULE_${m}_IS_PART_OF_WORLD)
message(STATUS " module ${m}...")
set(CMAKE_CURRENT_SOURCE_DIR "${OPENCV_MODULE_${m}_LOCATION}")

Loading…
Cancel
Save