Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/12118/head
Alexander Alekhin
commit 82c477c9f7
Files changed (71), with the number of changed lines per file:

  1. CMakeLists.txt (20)
  2. cmake/OpenCVCompilerOptimizations.cmake (11)
  3. cmake/OpenCVDetectInferenceEngine.cmake (124)
  4. cmake/OpenCVModule.cmake (6)
  5. doc/opencv.bib (14)
  6. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg (BIN)
  7. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png (BIN)
  8. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg (BIN)
  9. doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown (112)
  10. doc/tutorials/imgproc/table_of_content_imgproc.markdown (10)
  11. modules/core/include/opencv2/core/hal/intrin.hpp (21)
  12. modules/core/include/opencv2/core/hal/intrin_avx.hpp (391)
  13. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (23)
  14. modules/core/include/opencv2/core/hal/intrin_neon.hpp (30)
  15. modules/core/include/opencv2/core/hal/intrin_sse.hpp (486)
  16. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (13)
  17. modules/core/include/opencv2/core/utility.hpp (12)
  18. modules/core/src/arithm.cpp (3)
  19. modules/core/src/copy.cpp (3)
  20. modules/core/src/mathfuncs_core.simd.hpp (58)
  21. modules/core/src/matrix.cpp (13)
  22. modules/core/src/mean.cpp (4)
  23. modules/core/src/merge.cpp (71)
  24. modules/core/src/ocl.cpp (15)
  25. modules/core/src/rand.cpp (4)
  26. modules/core/src/split.cpp (69)
  27. modules/core/src/system.cpp (21)
  28. modules/core/src/umatrix.cpp (14)
  29. modules/core/test/test_arithm.cpp (10)
  30. modules/core/test/test_concatenation.cpp (119)
  31. modules/core/test/test_intrin.avx2.cpp (5)
  32. modules/core/test/test_intrin.cpp (298)
  33. modules/core/test/test_intrin.fp16.cpp (2)
  34. modules/core/test/test_intrin.simd.hpp (296)
  35. modules/core/test/test_intrin_utils.hpp (169)
  36. modules/core/test/test_rand.cpp (1)
  37. modules/dnn/CMakeLists.txt (16)
  38. modules/dnn/include/opencv2/dnn/dnn.hpp (2)
  39. modules/dnn/include/opencv2/dnn/shape_utils.hpp (26)
  40. modules/dnn/src/dnn.cpp (131)
  41. modules/dnn/src/layers/detection_output_layer.cpp (6)
  42. modules/dnn/src/layers/pooling_layer.cpp (63)
  43. modules/dnn/src/layers/proposal_layer.cpp (55)
  44. modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp (3)
  45. modules/dnn/src/op_inf_engine.cpp (20)
  46. modules/dnn/src/op_inf_engine.hpp (20)
  47. modules/dnn/src/opencl/ocl4dnn_pooling.cl (4)
  48. modules/dnn/src/tensorflow/tf_importer.cpp (8)
  49. modules/dnn/src/torch/torch_importer.cpp (10)
  50. modules/dnn/test/test_backends.cpp (9)
  51. modules/dnn/test/test_caffe_importer.cpp (99)
  52. modules/dnn/test/test_halide_layers.cpp (74)
  53. modules/dnn/test/test_layers.cpp (10)
  54. modules/dnn/test/test_tf_importer.cpp (7)
  55. modules/dnn/test/test_torch_importer.cpp (159)
  56. modules/highgui/src/window_w32.cpp (14)
  57. modules/imgproc/include/opencv2/imgproc.hpp (2)
  58. modules/python/bindings/CMakeLists.txt (8)
  59. modules/ts/include/opencv2/ts.hpp (28)
  60. modules/ts/include/opencv2/ts/ts_perf.hpp (12)
  61. modules/ts/src/ocl_test.cpp (26)
  62. modules/ts/src/ts.cpp (88)
  63. modules/ts/src/ts_func.cpp (137)
  64. modules/videoio/include/opencv2/videoio.hpp (2)
  65. modules/videoio/src/cap_dshow.cpp (37)
  66. modules/videoio/src/cap_gstreamer.cpp (33)
  67. modules/videoio/src/cap_msmf.cpp (16)
  68. modules/videoio/src/cap_vfw.cpp (2)
  69. samples/cpp/create_mask.cpp (122)
  70. samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp (149)
  71. samples/dnn/object_detection.py (2)

@@ -1402,15 +1402,19 @@ if(WITH_HALIDE OR HAVE_HALIDE)
  status("    Halide:" HAVE_HALIDE THEN "YES (${HALIDE_LIBRARIES} ${HALIDE_INCLUDE_DIRS})" ELSE NO)
endif()

-if(WITH_INF_ENGINE OR HAVE_INF_ENGINE)
-  if(HAVE_INF_ENGINE)
-    set(__msg "YES")
-    if(DEFINED INF_ENGINE_VERSION)
-      set(__msg "YES (ver ${INF_ENGINE_VERSION})")
-    endif()
+if(WITH_INF_ENGINE OR INF_ENGINE_TARGET)
+  if(INF_ENGINE_TARGET)
+    set(__msg "YES (${INF_ENGINE_RELEASE} / ${INF_ENGINE_VERSION})")
+    get_target_property(_lib ${INF_ENGINE_TARGET} IMPORTED_LOCATION)
+    if(NOT _lib)
+      get_target_property(_lib_rel ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_RELEASE)
+      get_target_property(_lib_dbg ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_DEBUG)
+      set(_lib "${_lib_rel} / ${_lib_dbg}")
+    endif()
+    get_target_property(_inc ${INF_ENGINE_TARGET} INTERFACE_INCLUDE_DIRECTORIES)
    status("    Inference Engine:" "${__msg}")
-    status("        libs:" "${INF_ENGINE_LIBRARIES}")
-    status("        includes:" "${INF_ENGINE_INCLUDE_DIRS}")
+    status("        libs:" "${_lib}")
+    status("        includes:" "${_inc}")
  else()
    status("    Inference Engine:" "NO")
  endif()

@@ -700,12 +700,21 @@ macro(ocv_compiler_optimization_fill_cpu_config)
    list(APPEND __dispatch_modes ${CPU_DISPATCH_${OPT}_FORCE} ${OPT})
  endforeach()
  list(REMOVE_DUPLICATES __dispatch_modes)
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "")
  foreach(OPT ${__dispatch_modes})
    set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
#define CV_CPU_DISPATCH_COMPILE_${OPT} 1")
  endforeach()
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+\n\n#define CV_CPU_DISPATCH_FEATURES 0 \\")
+ foreach(OPT ${__dispatch_modes})
+   if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+     set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+    , CV_CPU_${OPT} \\")
+   endif()
+ endforeach()
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}\n")

  set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "// AUTOGENERATED, DO NOT EDIT\n")
  foreach(OPT ${CPU_ALL_OPTIMIZATIONS})
    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
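For illustration, with a hypothetical dispatch list of SSE4_1 and AVX2 the snippet above would emit roughly the following fragment into the generated CPU configuration header (illustrative output only, not part of the commit):

// Hypothetical fragment generated by the CMake code above for
// CPU_DISPATCH = SSE4_1;AVX2 (illustrative only).
#define CV_CPU_DISPATCH_COMPILE_SSE4_1 1
#define CV_CPU_DISPATCH_COMPILE_AVX2 1

#define CV_CPU_DISPATCH_FEATURES 0 \
, CV_CPU_SSE4_1 \
, CV_CPU_AVX2 \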

@@ -1,79 +1,87 @@
-# The script detects Intel(R) Inference Engine installation
-#
-# Parameters:
-# INTEL_CVSDK_DIR - Path to Inference Engine root folder
-# IE_PLUGINS_PATH - Path to folder with Inference Engine plugins
-#
-# On return this will define:
-#
-# HAVE_INF_ENGINE - True if Intel Inference Engine was found
-# INF_ENGINE_INCLUDE_DIRS - Inference Engine include folder
-# INF_ENGINE_LIBRARIES - Inference Engine libraries and it's dependencies
-#
-macro(ie_fail)
-    set(HAVE_INF_ENGINE FALSE)
-    return()
-endmacro()
-
-find_package(InferenceEngine QUIET)
-if(InferenceEngine_FOUND)
-    set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}")
-    set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}")
-    set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
-    set(HAVE_INF_ENGINE TRUE)
-    return()
-endif()
-
-ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH)
-
-if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
-    set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
-    if(DEFINED INTEL_CVSDK_DIR)
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/")
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine")
-    endif()
-
-    if(NOT ie_root_paths)
-        list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/")
-    endif()
-
-    find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths})
-    if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$")
-        unset(INF_ENGINE_ROOT_DIR CACHE)
-    endif()
-endif()
-
-set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory")
-
-if(NOT INF_ENGINE_ROOT_DIR
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}"
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp"
-)
-    message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.")
-    ie_fail()
-endif()
-
-set(INF_ENGINE_LIBRARIES "")
-set(ie_lib_list inference_engine)
-
-if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}")
-    set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}")
-endif()
-
-link_directories(
-    ${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib
-    ${INF_ENGINE_ROOT_DIR}/external/cldnn/lib
-)
-
-foreach(lib ${ie_lib_list})
-    find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH})
-    if(NOT ${lib})
-        message(WARNING "DL IE: Can't find library: '${lib}'")
-        ie_fail()
-    endif()
-    list(APPEND INF_ENGINE_LIBRARIES ${${lib}})
-endforeach()
-
-set(HAVE_INF_ENGINE TRUE)
+# The script detects Intel(R) Inference Engine installation
+#
+# Cache variables:
+# INF_ENGINE_OMP_DIR - directory with OpenMP library to link with (needed by some versions of IE)
+# INF_ENGINE_RELEASE - a number reflecting IE source interface (linked with OpenVINO release)
+#
+# Detect parameters:
+# 1. Native cmake IE package:
+#    - environment variable InferenceEngine_DIR is set to location of cmake module
+# 2. Custom location:
+#    - INF_ENGINE_INCLUDE_DIRS - headers search location
+#    - INF_ENGINE_LIB_DIRS - library search location
+# 3. OpenVINO location:
+#    - environment variable INTEL_CVSDK_DIR is set to location of OpenVINO installation dir
+#    - INF_ENGINE_PLATFORM - part of name of library directory representing its platform (default ubuntu_16.04)
+#
+# Result:
+# INF_ENGINE_TARGET - set to name of imported library target representing InferenceEngine
+#
+
+if(NOT HAVE_CXX11)
+    message(WARNING "DL Inference engine requires C++11. You can turn it on via ENABLE_CXX11=ON CMake flag.")
+    return()
+endif()
+
+# =======================
+
+function(add_custom_ie_build _inc _lib _lib_rel _lib_dbg _msg)
+  if(NOT _inc OR NOT (_lib OR _lib_rel OR _lib_dbg))
+    return()
+  endif()
+  add_library(inference_engine UNKNOWN IMPORTED)
+  set_target_properties(inference_engine PROPERTIES
+    IMPORTED_LOCATION "${_lib}"
+    IMPORTED_IMPLIB_RELEASE "${_lib_rel}"
+    IMPORTED_IMPLIB_DEBUG "${_lib_dbg}"
+    INTERFACE_INCLUDE_DIRECTORIES "${_inc}"
+  )
+  find_library(omp_lib iomp5 PATHS "${INF_ENGINE_OMP_DIR}" NO_DEFAULT_PATH)
+  if(NOT omp_lib)
+    message(WARNING "OpenMP for IE have not been found. Set INF_ENGINE_OMP_DIR variable if you experience build errors.")
+  else()
+    set_target_properties(inference_engine PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${omp_lib}")
+  endif()
+  set(INF_ENGINE_VERSION "Unknown" CACHE STRING "")
+  set(INF_ENGINE_TARGET inference_engine PARENT_SCOPE)
+  message(STATUS "Detected InferenceEngine: ${_msg}")
+endfunction()
+
+# ======================
+
+find_package(InferenceEngine QUIET)
+if(InferenceEngine_FOUND)
+  set(INF_ENGINE_TARGET IE::inference_engine)
+  set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}" CACHE STRING "")
+  message(STATUS "Detected InferenceEngine: cmake package")
+endif()
+
+if(NOT INF_ENGINE_TARGET AND INF_ENGINE_LIB_DIRS AND INF_ENGINE_INCLUDE_DIRS)
+  find_path(ie_custom_inc "inference_engine.hpp" PATHS "${INF_ENGINE_INCLUDE_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_rel "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_dbg "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_inc}" "${ie_custom_lib}" "${ie_custom_lib_rel}" "${ie_custom_lib_dbg}" "INF_ENGINE_{INCLUDE,LIB}_DIRS")
+endif()
+
+set(_loc "$ENV{INTEL_CVSDK_DIR}")
+if(NOT INF_ENGINE_TARGET AND _loc)
+  set(INF_ENGINE_PLATFORM "ubuntu_16.04" CACHE STRING "InferenceEngine platform (library dir)")
+  find_path(ie_custom_env_inc "inference_engine.hpp" PATHS "${_loc}/deployment_tools/inference_engine/include" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/${INF_ENGINE_PLATFORM}/intel64" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_rel "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_dbg "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_env_inc}" "${ie_custom_env_lib}" "${ie_custom_env_lib_rel}" "${ie_custom_env_lib_dbg}" "OpenVINO (${_loc})")
+endif()
+
+# Add more features to the target
+
+if(INF_ENGINE_TARGET)
+  if(NOT INF_ENGINE_RELEASE)
+    message(WARNING "InferenceEngine version have not been set, 2018R2 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+  endif()
+  set(INF_ENGINE_RELEASE "2018020000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2018R2.0.2 -> 2018020002)")
+  set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
+    INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
+  )
+endif()
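Downstream C++ code linked against INF_ENGINE_TARGET then sees the compile definitions exported above; a minimal sketch of how such code can key off them (illustrative, not code from this commit):

// Sketch: consuming the definitions set via INTERFACE_COMPILE_DEFINITIONS
// above (HAVE_INF_ENGINE, INF_ENGINE_RELEASE); illustrative only.
#ifdef HAVE_INF_ENGINE
#  include <inference_engine.hpp>
#  if INF_ENGINE_RELEASE < 2018020000  // YYYYAABBCC, e.g. 2018R2.0.2 -> 2018020002
#    error "This sketch assumes the 2018R2 InferenceEngine interface or newer"
#  endif
#endif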

@@ -1132,7 +1132,7 @@ function(ocv_add_perf_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_PERF_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${perf_deps} "${perf_path}")
-     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_PERF_${the_module}_DEPS})
      add_dependencies(opencv_perf_tests ${the_target})

      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};PerfTest")

@@ -1175,7 +1175,7 @@ function(ocv_add_perf_tests)
endfunction()

# this is a command for adding OpenCV accuracy/regression tests to the module
-# ocv_add_accuracy_tests([FILES <source group name> <list of sources>] [DEPENDS_ON] <list of extra dependencies>)
+# ocv_add_accuracy_tests(<list of extra dependencies>)
function(ocv_add_accuracy_tests)
  ocv_debug_message("ocv_add_accuracy_tests(" ${ARGN} ")")

@@ -1211,7 +1211,7 @@ function(ocv_add_accuracy_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
-     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_TEST_${the_module}_DEPS})
      add_dependencies(opencv_tests ${the_target})
      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};AccuracyTest")

@@ -1016,3 +1016,17 @@
  year = {2017},
  organization = {IEEE}
}

@ARTICLE{gonzalez,
  title={Digital Image Fundamentals, Digital Image Processing},
  author={Gonzalez, Rafael C and others},
  year={1987},
  publisher={Addison Wesley Publishing Company}
}

@ARTICLE{gruzman,
  title={Digital Image Processing in Information Systems (in Russian)},
  author={Gruzman, I.S. and Kirichuk, V.S. and Kosykh, V.P. and Peretyagin, G.I. and Spektor, A.A.},
  year={2000},
  publisher={NSTU Publishing House, Novosibirsk}
}

Binary files added (content not shown):

  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg (14 KiB)
  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png (630 B)
  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg (42 KiB)

@ -0,0 +1,112 @@
Out-of-focus Deblur Filter {#tutorial_out_of_focus_deblur_filter}
==========================
Goal
----
In this tutorial you will learn:
- what a degradation image model is
- what the PSF of an out-of-focus image is
- how to restore a blurred image
- what the Wiener filter is
Theory
------
@note The explanation is based on the books @cite gonzalez and @cite gruzman. You can also refer to Matlab's tutorial [Image Deblurring in Matlab] and the article [SmartDeblur].
@note The out-of-focus image on this page is a real-world image; the defocus was produced manually with the camera optics.
### What is a degradation image model?
A mathematical model of image degradation, in the frequency domain, is:

\f[S = H\cdot U + N\f]

where
\f$S\f$ is the spectrum of the blurred (degraded) image,
\f$U\f$ is the spectrum of the original (undegraded) image,
\f$H\f$ is the frequency response of the point spread function (PSF),
\f$N\f$ is the spectrum of the additive noise.

A circular PSF is a good approximation of out-of-focus distortion. Such a PSF is specified by a single parameter, the radius \f$R\f$, and a circular PSF is what this tutorial uses.
![Circular point spread function](psf.png)
### How to restore a blurred image?

The objective of restoration (deblurring) is to obtain an estimate of the original image. The restoration formula in the frequency domain is:

\f[U' = H_w\cdot S\f]

where
\f$U'\f$ is the spectrum of the estimate of the original image \f$U\f$, and
\f$H_w\f$ is the restoration filter, for example, the Wiener filter.

### What is the Wiener filter?

The Wiener filter is one way to restore a blurred image. Suppose the PSF is a real and symmetric signal and the power spectra of the original image and of the noise are unknown; the simplified Wiener formula is then:

\f[H_w = \frac{H}{|H|^2+\frac{1}{SNR}} \f]

where
\f$SNR\f$ is the signal-to-noise ratio.

So, to recover an out-of-focus image with the Wiener filter, we need to know the \f$SNR\f$ and the radius \f$R\f$ of the circular PSF.
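As a quick illustration of this formula, here is a minimal sketch of synthesizing \f$H_w\f$ (assuming a single-channel float PSF `h` already padded to the image size; it omits the fftshift recentering done in the full sample, whose calcWnrFilter() is included below):

```cpp
// Minimal sketch of H_w = H / (|H|^2 + 1/SNR); illustrative only.
#include <opencv2/imgproc.hpp>
using namespace cv;

Mat synthesizeWiener(const Mat& h, double snr)
{
    Mat planes[2] = { h.clone(), Mat::zeros(h.size(), CV_32F) };
    Mat complexH;
    merge(planes, 2, complexH);
    dft(complexH, complexH);                 // H
    split(complexH, planes);
    Mat denom;
    magnitude(planes[0], planes[1], denom);  // |H|
    multiply(denom, denom, denom);           // |H|^2
    denom += 1.0 / snr;                      // |H|^2 + 1/SNR
    divide(planes[0], denom, planes[0]);     // assumes H is real and symmetric
    return planes[0];                        // H_w
}
```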
Source code
-----------
You can find the source code at `samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp` in the OpenCV source tree.
@include cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp
Explanation
-----------
An out-of-focus image recovery algorithm consists of PSF generation, Wiener filter generation, and filtering of the blurred image in the frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp main

The function calcPSF() forms a circular PSF according to the input parameter, the radius \f$R\f$:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcPSF

The function calcWnrFilter() synthesizes the simplified Wiener filter \f$H_w\f$ according to the formula described above:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcWnrFilter

The function fftshift() rearranges the PSF; this code was copied from the tutorial @ref tutorial_discrete_fourier_transform "Discrete Fourier Transform":
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp fftshift

The function filter2DFreq() filters the blurred image in the frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp filter2DFreq
Result
------
Below you can see the real out-of-focus image:
![Out-of-focus image](images/original.jpg)

The result below was obtained with \f$R = 53\f$ and \f$SNR = 5200\f$:
![The restored (deblurred) image](images/recovered.jpg)

The Wiener filter was used, and the values of \f$R\f$ and \f$SNR\f$ were selected manually to give the best possible visual result. We can see that the result is not perfect, but it gives us a hint of the image content; with some effort, the text is readable.

@note The parameter \f$R\f$ is the most important, so you should adjust \f$R\f$ first, then \f$SNR\f$.

@note Sometimes you can observe a ringing effect in the restored image. This effect can be reduced by several methods; for example, you can taper the input image edges.
You can also find a quick video demonstration of this on
[YouTube](https://youtu.be/0bEcE4B0XP4).
@youtube{0bEcE4B0XP4}
References
----------
- [Image Deblurring in Matlab] - Image Deblurring in Matlab
- [SmartDeblur] - SmartDeblur site
<!-- invisible references list -->
[Digital Image Processing]: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/Digital_Image_Processing_2ndEd.pdf
[Image Deblurring in Matlab]: https://www.mathworks.com/help/images/image-deblurring.html
[SmartDeblur]: http://yuzhikov.com/articles/BlurredImagesRestoration1.htm

@@ -292,3 +292,13 @@ In this section you will learn about the image processing (manipulation) functions inside OpenCV.

    *Author:* Theodore Tsesmelis

    Where we learn to segment objects using Laplacian filtering, the Distance Transformation and the Watershed algorithm.

- @subpage tutorial_out_of_focus_deblur_filter

    *Languages:* C++

    *Compatibility:* \> OpenCV 2.0

    *Author:* Karpushin Vladislav

    You will learn how to recover an out-of-focus image with the Wiener filter.

@@ -60,6 +60,17 @@
// access from within opencv code more accessible
namespace cv {

+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}

template<typename _Tp> struct V_TypeTraits
{
};
@@ -154,7 +165,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
-// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load())
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())

#if CV_AVX2
#include "opencv2/core/hal/intrin_avx.hpp"
@@ -214,14 +225,16 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
+   inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
+   inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }

#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -316,7 +329,7 @@ template<typename _Tp> struct V_RegTraits
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
inline void vx_cleanup() { v256_cleanup(); }

-#elif CV_SIMD128
+#elif CV_SIMD128 || CV_SIMD128_CPP

typedef v_uint8x16  v_uint8;
typedef v_int8x16   v_int8;
typedef v_uint16x8  v_uint16;
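A brief usage sketch of the width-agnostic vx_ forms defined by these macros (illustrative only; assumes OpenCV built with CV_SIMD enabled):

// Sketch: vx_load/v_store map to the widest available ISA (v_ or v256_).
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

void scale_add(const float* a, const float* b, float* dst, int n, float k)
{
    int i = 0;
#if CV_SIMD
    const v_float32 vk = vx_setall_f32(k);
    for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
    {
        v_float32 va = vx_load(a + i);
        v_float32 vb = vx_load(b + i);
        v_store(dst + i, v_muladd(va, vk, vb));
    }
    vx_cleanup();
#endif
    for (; i < n; i++)            // scalar tail
        dst[i] = a[i] * k + b[i];
}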

@@ -304,6 +304,17 @@ inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1
    { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm256_store_si256((__m256i*)ptr, a.val); } \
+   inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+   { _mm256_stream_si256((__m256i*)ptr, a.val); } \
+   inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+   { \
+       if( mode == hal::STORE_UNALIGNED ) \
+           _mm256_storeu_si256((__m256i*)ptr, a.val); \
+       else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+           _mm256_stream_si256((__m256i*)ptr, a.val); \
+       else \
+           _mm256_store_si256((__m256i*)ptr, a.val); \
+   } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
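A hedged example of when the new nocache path matters: writing a large buffer once, with no near-term reads, is exactly what streaming stores target (illustrative sketch; assumes an AVX2 build and a 32-byte aligned destination):

// Sketch: the v_store overload with hal::StoreMode added above.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void fill_large(const v_uint8x32& v, uchar* dst /* 32-byte aligned */, size_t n)
{
    for (size_t i = 0; i + 32 <= n; i += 32)
        v_store(dst + i, v, hal::STORE_ALIGNED_NOCACHE); // _mm256_stream_si256
}
#endif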
@@ -338,6 +349,17 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
    { _mm256_storeu_##suffix(ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm256_store_##suffix(ptr, a.val); } \
+   inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+   { _mm256_stream_##suffix(ptr, a.val); } \
+   inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+   { \
+       if( mode == hal::STORE_UNALIGNED ) \
+           _mm256_storeu_##suffix(ptr, a.val); \
+       else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+           _mm256_stream_##suffix(ptr, a.val); \
+       else \
+           _mm256_store_##suffix(ptr, a.val); \
+   } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -407,6 +429,11 @@ inline v_float16x16 v256_load_f16(const short* ptr)
inline v_float16x16 v256_load_f16_aligned(const short* ptr)
{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }

+inline v_float16x16 v256_load_f16_low(const short* ptr)
+{ return v_float16x16(v256_load_low(ptr).val); }
+inline v_float16x16 v256_load_f16_halves(const short* ptr0, const short* ptr1)
+{ return v_float16x16(v256_load_halves(ptr0, ptr1).val); }

inline void v_store(short* ptr, const v_float16x16& a)
{ _mm256_storeu_si256((__m256i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x16& a)
@@ -819,93 +846,79 @@ OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
-    switch(imm)
-    {
-        case 0:  return a;
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
-    return v_uint8x32();
+    enum {IMM_R = (16 - imm) & 0xFF};
+    enum {IMM_R2 = (32 - imm) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
+    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
-    switch(imm)
-    {
-        case 0:  return a;
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
-    return v_uint8x32();
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
+    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
-    v_uint8x32 res;
-    // ESAC control[3] ? [127:0] = 0
-    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
-    if (imm == 0)
-        return a;
-    if (imm == 16)
-        res.val = swapz;
-    else if (imm < 16)
-        res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
-    else if (imm < 32)
-        res.val = _mm256_slli_si256(swapz, imm - 16);
-    else
-        return v_uint8x32();
-    return res;
+    enum {IMM_L = (imm - 16) & 0xFF};
+    enum {IMM_R = (16 - imm) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // ESAC control[3] ? [127:0] = 0
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
+    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
-    v_uint8x32 res;
-    // ESAC control[3] ? [127:0] = 0
-    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
-    if (imm == 0)
-        return a;
-    if (imm == 16)
-        res.val = swapz;
-    else if (imm < 16)
-        res.val = _mm256_alignr_epi8(swapz, a.val, imm);
-    else if (imm < 32)
-        res.val = _mm256_srli_si256(swapz, imm - 16);
-    else
-        return v_uint8x32();
-    return res;
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // ESAC control[3] ? [127:0] = 0
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
+    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
}

#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)    \
    template<int imm>                                            \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)       \
    {                                                            \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a),  \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),   \
                                        v_reinterpret_as_u8(b)); \
        return _Tpvec(cast(ret.val));                            \
    }                                                            \
    template<int imm>                                            \
    inline _Tpvec intrin(const _Tpvec& a)                        \
    {                                                            \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));  \
        return _Tpvec(cast(ret.val));                            \
    }
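A semantics sketch for the refactored rotate intrinsics (illustrative; assumes an AVX2 build, lane values chosen for demonstration only):

// Sketch: v_rotate_right<imm>(a, b) shifts lanes of a down by imm,
// filling the top lanes from b. imm must be a compile-time constant
// because _mm256_alignr_epi8 takes an immediate, which is why the
// refactoring above folds the shift amounts into enum constants.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void rotate_demo(const uchar buf[32], uchar out[32])
{
    v_uint8x32 a = v256_load(buf);      // lanes 0..31
    v_uint8x32 z = v256_setzero_u8();
    // Result lanes are a[3..31] followed by three zeros from z.
    v_uint8x32 r = v_rotate_right<3>(a, z);
    v_store(out, r);
}
#endif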
@@ -1616,7 +1629,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

-   static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
@@ -1633,7 +1646,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16&
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));

-   static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
@@ -1683,16 +1696,16 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g,
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);

-   static const __m256i
+   const __m256i
    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
@@ -1717,18 +1730,18 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16&
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);

-   static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+   const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                          0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
-   static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+   const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
                                          2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
-   static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+   const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
                                          4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
@@ -1785,7 +1798,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g,
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));

-   static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
@@ -1820,7 +1833,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16&
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));

-   static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
                                        0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
@@ -1901,7 +1914,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g
///////////////////////////// store interleave /////////////////////////////////////

-inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);

@@ -1909,11 +1923,25 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 32), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+   }
}
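For context, a hedged usage sketch of the interleaving store with the new mode argument (illustrative; assumes an AVX2 build and a 32-byte aligned destination):

// Sketch: v_store_interleave with the StoreMode parameter added above.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void interleave_xy(const uchar* x, const uchar* y, uchar* dst /* aligned */)
{
    v_uint8x32 vx = v256_load(x);
    v_uint8x32 vy = v256_load(y);
    // Takes the _mm256_stream_si256 branch added in this commit.
    v_store_interleave(dst, vx, vy, hal::STORE_ALIGNED_NOCACHE);
}
#endif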
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);

@@ -1921,11 +1949,25 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 16), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+   }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);

@@ -1933,11 +1975,25 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 8), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+   }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);

@@ -1945,19 +2001,33 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 4), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+   }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
-   static const __m256i sh_b = _mm256_setr_epi8(
+   const __m256i sh_b = _mm256_setr_epi8(
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
-   static const __m256i sh_g = _mm256_setr_epi8(
+   const __m256i sh_g = _mm256_setr_epi8(
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
-   static const __m256i sh_r = _mm256_setr_epi8(
+   const __m256i sh_r = _mm256_setr_epi8(
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);

@@ -1965,9 +2035,9 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);

-   static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+   const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);

@@ -1978,20 +2048,36 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
+       _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
+       _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+   }
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
-   static const __m256i sh_b = _mm256_setr_epi8(
+   const __m256i sh_b = _mm256_setr_epi8(
        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
-   static const __m256i sh_g = _mm256_setr_epi8(
+   const __m256i sh_g = _mm256_setr_epi8(
        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
-   static const __m256i sh_r = _mm256_setr_epi8(
+   const __m256i sh_r = _mm256_setr_epi8(
        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);

@@ -1999,9 +2085,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);

@@ -2012,12 +2098,28 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
    //__m256i bgr1 = p1;
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 16), p1);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 16), p1);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+   }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
    __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);

@@ -2031,12 +2133,28 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
    //__m256i bgr1 = p2;
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 8), p2);
+       _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 8), p2);
+       _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+   }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
    __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);

@@ -2046,12 +2164,29 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64
    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
+       _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
+       _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+   }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
+                                const v_uint8x32& r, const v_uint8x32& a,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);

@@ -2068,14 +2203,32 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgra0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
+       _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
+       _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgra0);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
+       _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
+       _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+   }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
                                const v_uint16x16& r, const v_uint16x16& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
@@ -2092,14 +2245,32 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
                                const v_uint32x8& r, const v_uint32x8& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
@@ -2116,14 +2287,32 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
                                const v_uint64x4& r, const v_uint64x4& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
@@ -2135,10 +2324,27 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64
    __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
    }
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
@@ -2166,27 +2372,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
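A note on the extended API: every `v_store_interleave` overload above now takes an optional `hal::StoreMode`, so callers can request aligned or non-temporal (streaming) stores without duplicating the shuffle logic. A minimal usage sketch (assuming an AVX2 build; `dst`, `srcB`, `srcG`, `srcR` are hypothetical caller-managed buffers, and `dst` must be 32-byte aligned for the non-unaligned modes):

    // Interleave three 32-lane planes into packed BGR, bypassing the cache.
    v_uint8x32 b = v256_load(srcB);
    v_uint8x32 g = v256_load(srcG);
    v_uint8x32 r = v256_load(srcR);
    v_store_interleave(dst, b, g, r, hal::STORE_ALIGNED_NOCACHE);

Non-temporal stores pay off when the output is larger than the cache and will not be re-read soon, which is exactly the `merge`/`split` use case this series targets.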

@@ -1319,7 +1319,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i2;
    for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1339,7 +1340,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i3;
    for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1360,7 +1362,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                                               const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                                               const v_reg<_Tp, n>& d,
                                                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i4;
    for( i = i4 = 0; i < n; i++, i4 += 4 )
@@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
        ptr[i] = a.s[i];
}

template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
    for( int i = 0; i < n; i++ )
        ptr[i] = a.s[i];
}

template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
    for( int i = 0; i < n; i++ )
        ptr[i] = a.s[i];
}

/** @brief Combine vector from first elements of two vectors

Scheme:
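In the portable C++ fallback both of the new overloads intentionally ignore the mode and perform a plain element-wise copy, so code written against the mode-taking API still compiles and behaves correctly when no SIMD backend is available. A sketch (with `buf` a hypothetical suitably aligned float buffer):

    v_float32x4 v = v_setall_f32(1.f);
    // Degrades to ptr[i] = a.s[i]; the no-cache hint is simply dropped here.
    v_store_aligned(buf, v, hal::STORE_ALIGNED_NOCACHE);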

@@ -319,6 +319,9 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a)
#endif
}

#ifndef vdup_n_f16
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif

struct v_float16x8
{
@@ -864,6 +867,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -889,6 +896,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }

inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); }

inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
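The two new FP16 loaders mirror the existing `v_load_low`/`v_load_halves` helpers for other element types. A usage sketch (assuming `row0` and `row1` each point to at least four FP16 values stored as `short`):

    v_float16x8 lo   = v_load_f16_low(row0);           // row0[0..3] in the low half, high half zeroed
    v_float16x8 both = v_load_f16_halves(row0, row1);  // row0[0..3] | row1[0..3]

This allows two independent 4-element rows to be processed in one 8-lane register, for instance in loop tails.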
@@ -1292,14 +1304,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
@@ -1308,7 +1322,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
@@ -1360,7 +1375,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1369,7 +1385,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1380,7 +1397,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \

@@ -788,7 +788,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x)
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
@@ -801,7 +801,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x)
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
@@ -1261,6 +1261,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_si128((__m128i*)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_si128((__m128i*)ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_si128((__m128i*)ptr, a.val); \
    else \
        _mm_store_si128((__m128i*)ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
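The runtime `mode` overload lets a single loop body pick the store flavor per destination instead of per call site. A sketch of the intended pattern (mirroring the `y_aligned` check used by `exp32f` further down in this patch; `dst` is a hypothetical output pointer):

    v_int32x4 v = v_setall_s32(42);
    hal::StoreMode mode = ((size_t)(void*)dst % 16 == 0) ? hal::STORE_ALIGNED_NOCACHE
                                                         : hal::STORE_UNALIGNED;
    v_store(dst, v, mode);  // dispatches to _mm_storeu / _mm_stream / _mm_store

Note that `_mm_stream_si128` requires a 16-byte-aligned address, so STORE_ALIGNED_NOCACHE must only be requested for aligned pointers.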
@@ -1292,6 +1303,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_##suffix(ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_##suffix(ptr, a.val); \
    else \
        _mm_store_##suffix(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1308,6 +1330,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }

inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(v_load_low(ptr).val); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(v_load_halves(ptr0, ptr1).val); }

inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
@@ -1671,17 +1698,17 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    a0 = _mm_shuffle_epi8(a0, sh_b);
    b0 = _mm_shuffle_epi8(b0, sh_g);
    c0 = _mm_shuffle_epi8(c0, sh_r);
@@ -1689,9 +1716,9 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    b.val = b0;
    c.val = c0;
#elif CV_SSSE3
    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
@@ -1784,9 +1811,9 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    a0 = _mm_shuffle_epi8(a0, sh_a);
    b0 = _mm_shuffle_epi8(b0, sh_b);
    c0 = _mm_shuffle_epi8(c0, sh_c);
@@ -1955,55 +1982,61 @@ inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
// store interleave

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
#elif CV_SSSE3
    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);

    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i v0 = _mm_shuffle_epi8(t0, m0);

    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i v1 = _mm_shuffle_epi8(t1, m1);

    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i v2 = _mm_shuffle_epi8(t2, m2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
@@ -2042,15 +2075,31 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x1
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
        _mm_stream_si128((__m128i*)(ptr + 32), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
        _mm_store_si128((__m128i*)(ptr + 32), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
@@ -2062,33 +2111,64 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x1
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
        _mm_stream_si128((__m128i*)(ptr + 32), v2);
        _mm_stream_si128((__m128i*)(ptr + 48), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
        _mm_store_si128((__m128i*)(ptr + 32), v2);
        _mm_store_si128((__m128i*)(ptr + 48), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
        _mm_storeu_si128((__m128i*)(ptr + 48), v3);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b, const v_uint16x8& c,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
@@ -2096,10 +2176,6 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
@@ -2128,15 +2204,30 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
        _mm_stream_si128((__m128i*)(ptr + 16), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
        _mm_store_si128((__m128i*)(ptr + 16), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
@@ -2148,27 +2239,58 @@ inline void v_store_interleave( ushort* ptr, const v_uint16
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
        _mm_stream_si128((__m128i*)(ptr + 16), v2);
        _mm_stream_si128((__m128i*)(ptr + 24), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
        _mm_store_si128((__m128i*)(ptr + 16), v2);
        _mm_store_si128((__m128i*)(ptr + 24), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
        _mm_storeu_si128((__m128i*)(ptr + 24), v3);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 4), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 4), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
@@ -2177,35 +2299,82 @@ inline void v_store_interleave( unsigned* ptr, const v_uint
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 4), v1);
        _mm_stream_si128((__m128i*)(ptr + 8), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 4), v1);
        _mm_store_si128((__m128i*)(ptr + 8), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
        _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    }
}
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    v_uint32x4 v0, v1, v2, v3;
    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0.val);
        _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
        _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
        _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0.val);
        _mm_store_si128((__m128i*)(ptr + 4), v1.val);
        _mm_store_si128((__m128i*)(ptr + 8), v2.val);
        _mm_store_si128((__m128i*)(ptr + 12), v3.val);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0.val);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
        _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
        _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
    }
}
// 2-channel, float only

inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
    }
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
@@ -2217,13 +2386,29 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
        _mm_stream_ps(ptr + 8, v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
        _mm_store_ps(ptr + 8, v2);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
        _mm_storeu_ps(ptr + 8, v2);
    }
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, const v_float32x4& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
@@ -2234,43 +2419,109 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    __m128 v1 = _mm_unpackhi_ps(u0, u1);
    __m128 v3 = _mm_unpackhi_ps(u2, u3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
        _mm_stream_ps(ptr + 8, v2);
        _mm_stream_ps(ptr + 12, v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
        _mm_store_ps(ptr + 8, v2);
        _mm_store_ps(ptr + 12, v3);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
        _mm_storeu_ps(ptr + 8, v2);
        _mm_storeu_ps(ptr + 12, v3);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
        _mm_stream_si128((__m128i*)(ptr + 4), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
        _mm_store_si128((__m128i*)(ptr + 4), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, const v_uint64x2& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
    __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
    __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
        _mm_stream_si128((__m128i*)(ptr + 4), v2);
        _mm_stream_si128((__m128i*)(ptr + 6), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
        _mm_store_si128((__m128i*)(ptr + 4), v2);
        _mm_store_si128((__m128i*)(ptr + 6), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
        _mm_storeu_si128((__m128i*)(ptr + 6), v3);
    }
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
@@ -2298,27 +2549,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)

@@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
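Note that this VSX mapping has no separate streaming store: STORE_ALIGNED_NOCACHE and STORE_ALIGNED both take the `st_a` path, and only STORE_UNALIGNED differs, so the cache hint is honored only where the ISA actually provides one. For example:

    v_store(ptr, v, hal::STORE_ALIGNED_NOCACHE);  // on VSX: a plain aligned store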
@@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
                                _Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
                               const _Tpvec& b, const _Tpvec& c, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               const _Tpvec& c, const _Tpvec& d, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }

OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)

@@ -457,6 +457,18 @@ Returns empty string if feature is not defined
*/
CV_EXPORTS_W String getHardwareFeatureName(int feature);
/** @brief Returns the list of CPU features enabled during compilation.

The returned value is a string containing a space-separated list of CPU features with the following markers:

- no marker - baseline feature
- prefix `*` - feature enabled in the dispatcher
- suffix `?` - feature enabled but not available in hardware

Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
*/
CV_EXPORTS std::string getCPUFeaturesLine();
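A usage sketch for the new query (hypothetical diagnostics snippet; the function itself is part of this patch):

    #include <iostream>
    #include <opencv2/core/utility.hpp>

    int main()
    {
        // Prints e.g. "SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2"
        std::cout << "CPU features: " << cv::getCPUFeaturesLine() << std::endl;
        return 0;
    }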
/** @brief Returns the number of logical CPUs available for the process.
*/
CV_EXPORTS_W int getNumberOfCPUs();

@@ -1180,7 +1180,8 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
    CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
               op == CMP_NE || op == CMP_GE || op == CMP_GT );

    CV_Assert(_src1.empty() == _src2.empty());
    if (_src1.empty() && _src2.empty())
    {
        _dst.release();
        return;
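The new check tightens the empty-input contract of `cv::compare`: both inputs empty yields an empty output, while a mix of empty and non-empty now fails the assertion instead of silently returning. A sketch of the resulting behavior:

    cv::Mat a, b, dst;
    cv::compare(a, b, dst, cv::CMP_EQ);       // OK: both empty -> dst is released (empty)
    // cv::Mat c = cv::Mat::ones(3, 3, CV_8U);
    // cv::compare(a, c, dst, cv::CMP_EQ);    // would now trigger the CV_Assert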

@@ -411,7 +411,8 @@ Mat& Mat::operator = (const Scalar& s)
{
    CV_INSTRUMENT_REGION()

    if (this->empty())
        return *this;

    const Mat* arrays[] = { this };
    uchar* dptr;

@@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n )
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
    const v_float32 vminval = vx_setall_f32(minval);
    const v_float32 vmaxval = vx_setall_f32(maxval);
    const v_float32 vA1 = vx_setall_f32((float)A1);
    const v_float32 vA2 = vx_setall_f32((float)A2);
    const v_float32 vA3 = vx_setall_f32((float)A3);
    const v_float32 vA4 = vx_setall_f32((float)A4);
    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n )
#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);
    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);
    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n )
#if CV_SIMD #if CV_SIMD
const int VECSZ = v_float32::nlanes; const int VECSZ = v_float32::nlanes;
static const v_float32 vln2 = vx_setall_f32((float)ln_2); const v_float32 vln2 = vx_setall_f32((float)ln_2);
static const v_float32 v1 = vx_setall_f32(1.f); const v_float32 v1 = vx_setall_f32(1.f);
static const v_float32 vshift = vx_setall_f32(-1.f/512); const v_float32 vshift = vx_setall_f32(-1.f/512);
static const v_float32 vA0 = vx_setall_f32(A0); const v_float32 vA0 = vx_setall_f32(A0);
static const v_float32 vA1 = vx_setall_f32(A1); const v_float32 vA1 = vx_setall_f32(A1);
static const v_float32 vA2 = vx_setall_f32(A2); const v_float32 vA2 = vx_setall_f32(A2);
for( ; i < n; i += VECSZ ) for( ; i < n; i += VECSZ )
{ {
@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n )
#if CV_SIMD_64F #if CV_SIMD_64F
const int VECSZ = v_float64::nlanes; const int VECSZ = v_float64::nlanes;
static const v_float64 vln2 = vx_setall_f64(ln_2); const v_float64 vln2 = vx_setall_f64(ln_2);
static const v_float64 const v_float64
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
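One plausible reason for dropping `static` on these vector constants (my inference; the diff itself does not state it): a function-local static is initialized once behind a thread-safety guard that is then checked on every call, and `vx_setall_*` yields a register whose width follows the SIMD mode of the translation unit, which makes a cached value both needless overhead and a hazard under runtime dispatch. A sketch of the adopted pattern (illustrative code, not from the patch):

#include <opencv2/core/hal/intrin.hpp>

static void scale_array(const float* x, float* y, int n, float s)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_float32::nlanes;
    const cv::v_float32 vs = cv::vx_setall_f32(s); // per-call init, no static guard
    for( ; i <= n - VECSZ; i += VECSZ )
        cv::v_store(y + i, cv::vx_load(x + i) * vs);
    cv::vx_cleanup();
#endif
    for( ; i < n; i++ )
        y[i] = x[i]*s;
}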

@@ -602,13 +602,13 @@ void Mat::pop_back(size_t nelems)

void Mat::push_back_(const void* elem)
{
-   int r = size.p[0];
+   size_t r = size.p[0];
    if( isSubmatrix() || dataend + step.p[0] > datalimit )
        reserve( std::max(r + 1, (r*3+1)/2) );

    size_t esz = elemSize();
    memcpy(data + r*step.p[0], elem, esz);
-   size.p[0] = r + 1;
+   size.p[0] = int(r + 1);
    dataend += step.p[0];

    uint64 tsz = size.p[0];
    for( int i = 1; i < dims; i++ )
@@ -709,7 +709,8 @@ void Mat::resize(size_t nelems, const Scalar& s)

void Mat::push_back(const Mat& elems)
{
-   int r = size.p[0], delta = elems.size.p[0];
+   size_t r = size.p[0];
+   size_t delta = elems.size.p[0];
    if( delta == 0 )
        return;

    if( this == &elems )
@@ -726,7 +727,7 @@ void Mat::push_back(const Mat& elems)
    size.p[0] = elems.size.p[0];
    bool eq = size == elems.size;
-   size.p[0] = r;
+   size.p[0] = int(r);
    if( !eq )
        CV_Error(CV_StsUnmatchedSizes, "Pushed vector length is not equal to matrix row length");
    if( type() != elems.type() )
@@ -735,7 +736,7 @@ void Mat::push_back(const Mat& elems)
    if( isSubmatrix() || dataend + step.p[0]*delta > datalimit )
        reserve( std::max(r + delta, (r*3+1)/2) );

-   size.p[0] += delta;
+   size.p[0] += int(delta);
    dataend += step.p[0]*delta;

    //updateContinuityFlag(*this);
@@ -744,7 +745,7 @@ void Mat::push_back(const Mat& elems)
        memcpy(data + r*step.p[0], elems.data, elems.total()*elems.elemSize());
    else
    {
-       Mat part = rowRange(r, r + delta);
+       Mat part = rowRange(int(r), int(r + delta));
        elems.copyTo(part);
    }
}

@@ -766,11 +766,13 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    CV_INSTRUMENT_REGION()

+   CV_Assert(!_src.empty());
+   CV_Assert( _mask.empty() || _mask.type() == CV_8UC1 );

    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
-   CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
               openvx_meanStdDev(src, _mean, _sdv, mask))

@@ -9,21 +9,58 @@
namespace cv { namespace hal {

#if CV_SIMD
+/*
+  The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
+  on IA there are instructions movntps and such to which
+  v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
+  Those instructions write directly into memory w/o touching cache
+  that results in dramatic speed improvements, especially on
+  large arrays (FullHD, 4K etc.).
+  Those intrinsics require the destination address to be aligned
+  by 16/32 bytes (with SSE2 and AVX2, respectively).
+  So we potentially split the processing into 3 stages:
+  1) the optional prefix part [0:i0), where we use simple unaligned stores.
+  2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
+     But in some cases we have to use unaligned stores in this part.
+  3) the optional suffix part (the tail) (len - VECSZ:len) where we switch back
+     to "unaligned" mode to process the remaining elements.
+  In principle there can be very poorly aligned data where there is no main part.
+  For that we set i0=0 and use unaligned stores for the whole array.
+*/
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
-   int i;
+   const int VECSZ = VecT::nlanes;
+   int i, i0 = 0;
    const T* src0 = src[0];
    const T* src1 = src[1];
-   const int VECSZ = VecT::nlanes;
+   int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
+   hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+   if( r != 0 )
+   {
+       mode = hal::STORE_UNALIGNED;
+       if( r % cn == 0 && len > VECSZ )
+           i0 = VECSZ - (r / cn);
+   }

    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
-           v_store_interleave(dst + i*cn, a, b);
+           v_store_interleave(dst + i*cn, a, b, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else if( cn == 3 )
@@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src2 = src[2];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
-           v_store_interleave(dst + i*cn, a, b, c);
+           v_store_interleave(dst + i*cn, a, b, c, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else
@@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src3 = src[3];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
            VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
-           v_store_interleave(dst + i*cn, a, b, c, d);
+           v_store_interleave(dst + i*cn, a, b, c, d, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    vx_cleanup();
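A compact sketch of the three-stage pattern the comment above describes (my illustration with a plain copy, not the patch code); the aligned "nocache" store overload used here is the same one the new vecmerge_/vecsplit_ code relies on:

#include <opencv2/core/hal/intrin.hpp>

static void copy_nocache(const float* src, float* dst, int len)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_float32::nlanes;
    const size_t align = VECSZ * sizeof(float);
    if( len >= VECSZ )
    {
        // 1) prefix: scalar stores until dst + i reaches the required alignment
        while( i < len && ((size_t)(void*)(dst + i) % align) != 0 )
            dst[i] = src[i], i++;
        // 2) main part: aligned non-temporal stores that bypass the cache
        for( ; i + VECSZ <= len; i += VECSZ )
            cv::v_store(dst + i, cv::vx_load(src + i), cv::hal::STORE_ALIGNED_NOCACHE);
        cv::vx_cleanup();
    }
#endif
    // 3) tail: the leftover elements go through plain scalar stores again
    for( ; i < len; i++ )
        dst[i] = src[i];
}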

@@ -2834,7 +2834,22 @@ extern "C" {

static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
{
+   try
+   {
        ((cv::ocl::Kernel::Impl*)p)->finit(e);
+   }
+   catch (const cv::Exception& exc)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
+   }
+   catch (const std::exception& exc)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
+   }
+   catch (...)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
+   }
}

}

@@ -511,8 +511,8 @@ static RandnScaleFunc randnScaleTab[] =
void RNG::fill( InputOutputArray _mat, int disttype,
                InputArray _param1arg, InputArray _param2arg, bool saturateRange )
{
-   if (_mat.empty())
-       return;
+   CV_Assert(!_mat.empty());
    Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat();
    int depth = mat.depth(), cn = mat.channels();
    AutoBuffer<double> _parambuf;

@@ -9,23 +9,46 @@
namespace cv { namespace hal {

#if CV_SIMD
+// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
-   int i;
+   const int VECSZ = VecT::nlanes;
+   int i, i0 = 0;
    T* dst0 = dst[0];
    T* dst1 = dst[1];
-   const int VECSZ = VecT::nlanes;
+   int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T)));
+   int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
+   int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0;
+   int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
+   hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+   if( (r0|r1|r2|r3) != 0 )
+   {
+       mode = hal::STORE_UNALIGNED;
+       if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
+           i0 = VECSZ - (r0 / cn);
+   }

    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b;
            v_load_deinterleave(src + i*cn, a, b);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else if( cn == 3 )
@@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst2 = dst[2];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b, c;
            v_load_deinterleave(src + i*cn, a, b, c);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
-           v_store(dst2 + i, c);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           v_store(dst2 + i, c, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else
@@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst3 = dst[3];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b, c, d;
            v_load_deinterleave(src + i*cn, a, b, c, d);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
-           v_store(dst2 + i, c);
-           v_store(dst3 + i, d);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           v_store(dst2 + i, c, mode);
+           v_store(dst3 + i, d, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    vx_cleanup();

@@ -654,6 +654,27 @@ String getHardwareFeatureName(int feature)
    return name ? String(name) : String();
}

+std::string getCPUFeaturesLine()
+{
+    const int features[] = { CV_CPU_BASELINE_FEATURES, CV_CPU_DISPATCH_FEATURES };
+    const int sz = sizeof(features) / sizeof(features[0]);
+    std::string result;
+    std::string prefix;
+    for (int i = 1; i < sz; ++i)
+    {
+        if (features[i] == 0)
+        {
+            prefix = "*";
+            continue;
+        }
+        if (i != 1) result.append(" ");
+        result.append(prefix);
+        result.append(getHWFeatureNameSafe(features[i]));
+        if (!checkHardwareSupport(features[i])) result.append("?");
+    }
+    return result;
+}
+
volatile bool useOptimizedFlag = true;

void setUseOptimized( bool flag )

@@ -84,14 +84,11 @@ UMatData::~UMatData()
    allocatorFlags_ = 0;
    if (originalUMatData)
    {
-       UMatData* u = originalUMatData;
-       CV_XADD(&(u->urefcount), -1);
-       CV_XADD(&(u->refcount), -1);
        bool showWarn = false;
-       if (u->refcount == 0)
+       UMatData* u = originalUMatData;
+       bool zero_Ref = CV_XADD(&(u->refcount), -1) == 1;
+       if (zero_Ref)
        {
-           if (u->urefcount > 0)
-               showWarn = true;
            // simulate Mat::deallocate
            if (u->mapcount != 0)
            {
@@ -102,7 +99,10 @@ UMatData::~UMatData()
                // we don't do "map", so we can't do "unmap"
            }
        }
-       if (u->refcount == 0 && u->urefcount == 0) // oops, we need to free resources
+       bool zero_URef = CV_XADD(&(u->urefcount), -1) == 1;
+       if (zero_Ref && !zero_URef)
+           showWarn = true;
+       if (zero_Ref && zero_URef) // oops, we need to free resources
        {
            showWarn = true;
            // simulate UMat::deallocate
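The idiom adopted above deserves a note: CV_XADD is an atomic fetch-and-add that returns the value before the update, so `CV_XADD(&ref, -1) == 1` tells exactly one thread that it just dropped the last reference, whereas re-reading the counter after the decrement (the old code) races with other owners. A sketch of the same pattern with std::atomic (an illustration, not OpenCV code):

#include <atomic>

struct Shared { std::atomic<int> refcount{1}; };

static void release(Shared* s)
{
    // fetch_sub returns the previous value: only the thread that sees 1
    // released the last reference, so only it may free the object.
    if (s->refcount.fetch_sub(1) == 1)
        delete s;
}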

@@ -2008,11 +2008,9 @@ TEST(Subtract, scalarc4_matc4)
TEST(Compare, empty)
{
    cv::Mat temp, dst1, dst2;
-   cv::compare(temp, temp, dst1, cv::CMP_EQ);
-   dst2 = temp > 5;
+   EXPECT_NO_THROW(cv::compare(temp, temp, dst1, cv::CMP_EQ));
    EXPECT_TRUE(dst1.empty());
-   EXPECT_TRUE(dst2.empty());
+   EXPECT_THROW(dst2 = temp > 5, cv::Exception);
}

TEST(Compare, regression_8999)
@@ -2020,9 +2018,7 @@ TEST(Compare, regression_8999)
    Mat_<double> A(4,1); A << 1, 3, 2, 4;
    Mat_<double> B(1,1); B << 2;
    Mat C;
-   ASSERT_ANY_THROW({
-       cv::compare(A, B, C, CMP_LT);
-   });
+   EXPECT_THROW(cv::compare(A, B, C, CMP_LT), cv::Exception);
}

@@ -43,106 +43,35 @@
namespace opencv_test { namespace {

-class Core_ConcatenationTest : public cvtest::BaseTest
-{
-public:
-    Core_ConcatenationTest(bool horizontal, bool firstEmpty, bool secondEmpty);
-protected:
-    int prepare_test_case( int );
-    void run_func();
-    int validate_test_results( int );
-
-    Mat mat0x5;
-    Mat mat10x5;
-    Mat mat20x5;
-
-    Mat mat5x0;
-    Mat mat5x10;
-    Mat mat5x20;
-
-    Mat result;
-
-    bool horizontal;
-    bool firstEmpty;
-    bool secondEmpty;
-
-private:
-    static bool areEqual(const Mat& m1, const Mat& m2);
-};
-
-Core_ConcatenationTest::Core_ConcatenationTest(bool horizontal_, bool firstEmpty_, bool secondEmpty_)
-    : horizontal(horizontal_)
-    , firstEmpty(firstEmpty_)
-    , secondEmpty(secondEmpty_)
-{
-    test_case_count = 1;
-
-    mat0x5 = Mat::ones(0,5, CV_8U);
-    mat10x5 = Mat::ones(10,5, CV_8U);
-    mat20x5 = Mat::ones(20,5, CV_8U);
-    mat5x0 = Mat::ones(5,0, CV_8U);
-    mat5x10 = Mat::ones(5,10, CV_8U);
-    mat5x20 = Mat::ones(5,20, CV_8U);
-}
-
-int Core_ConcatenationTest::prepare_test_case( int test_case_idx )
-{
-    cvtest::BaseTest::prepare_test_case( test_case_idx );
-    return 1;
-}
-
-void Core_ConcatenationTest::run_func()
-{
-    if (horizontal)
-    {
-        cv::hconcat((firstEmpty ? mat5x0 : mat5x10),
-                    (secondEmpty ? mat5x0 : mat5x10),
-                    result);
-    } else {
-        cv::vconcat((firstEmpty ? mat0x5 : mat10x5),
-                    (secondEmpty ? mat0x5 : mat10x5),
-                    result);
-    }
-}
-
-int Core_ConcatenationTest::validate_test_results( int )
-{
-    Mat expected;
-
-    if (firstEmpty && secondEmpty)
-        expected = (horizontal ? mat5x0 : mat0x5);
-    else if ((firstEmpty && !secondEmpty) || (!firstEmpty && secondEmpty))
-        expected = (horizontal ? mat5x10 : mat10x5);
-    else
-        expected = (horizontal ? mat5x20 : mat20x5);
-
-    if (areEqual(expected, result))
-    {
-        return cvtest::TS::OK;
-    } else
-    {
-        ts->printf( cvtest::TS::LOG, "Concatenation failed");
-        ts->set_failed_test_info( cvtest::TS::FAIL_MISMATCH );
-    }
-    return cvtest::TS::OK;
-}
-
-bool Core_ConcatenationTest::areEqual(const Mat &m1, const Mat &m2)
-{
-    return m1.size() == m2.size()
-        && m1.type() == m2.type()
-        && countNonZero(m1 != m2) == 0;
-}
-
-TEST(Core_Concatenation, hconcat_empty_nonempty) { Core_ConcatenationTest test(true, true, false); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_nonempty_empty) { Core_ConcatenationTest test(true, false, true); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_empty_empty) { Core_ConcatenationTest test(true, true, true); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_empty_nonempty) { Core_ConcatenationTest test(false, true, false); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_nonempty_empty) { Core_ConcatenationTest test(false, false, true); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_empty_empty) { Core_ConcatenationTest test(false, true, true); test.safe_run(); }
+TEST(Core_Concatenation, empty)
+{
+    const Mat mat0x5(0,5, CV_8U, Scalar::all(1));
+    const Mat mat10x5(10,5, CV_8U, Scalar::all(1));
+    const Mat mat20x5(20,5, CV_8U, Scalar::all(1));
+
+    const Mat mat5x0(5,0, CV_8U, Scalar::all(1));
+    const Mat mat5x10(5,10, CV_8U, Scalar::all(1));
+    const Mat mat5x20(5,20, CV_8U, Scalar::all(1));
+
+    Mat result;
+
+    cv::hconcat(mat5x0, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x0, 0);
+    cv::hconcat(mat5x0, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x20, 0);
+
+    cv::vconcat(mat0x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat0x5, 0);
+    cv::vconcat(mat0x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat20x5, 0);
+}

}} // namespace

@@ -0,0 +1,5 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "test_intrin.simd.hpp"

@@ -2,249 +2,101 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"

-#include "test_intrin_utils.hpp"
-#define CV_CPU_SIMD_FILENAME "test_intrin_utils.hpp"
+#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
#define CV_CPU_DISPATCH_MODE FP16
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

-using namespace cv;
+#define CV_CPU_DISPATCH_MODE AVX2
+#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

namespace opencv_test { namespace hal {
using namespace CV_CPU_OPTIMIZATION_NAMESPACE;

-//============= 8-bit integer =====================================================================
-
-TEST(hal_intrin, uint8x16) {
-    TheTest<v_uint8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-TEST(hal_intrin, int8x16) {
-    TheTest<v_int8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-//============= 16-bit integer =====================================================================
-
-TEST(hal_intrin, uint16x8) {
-    TheTest<v_uint16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-TEST(hal_intrin, int16x8) {
-    TheTest<v_int16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-//============= 32-bit integer =====================================================================
-
-TEST(hal_intrin, uint32x4) {
-    TheTest<v_uint32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
-
-TEST(hal_intrin, int32x4) {
-    TheTest<v_int32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
-
-//============= 64-bit integer =====================================================================
-
-TEST(hal_intrin, uint64x2) {
-    TheTest<v_uint64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-TEST(hal_intrin, int64x2) {
-    TheTest<v_int64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-//============= Floating point =====================================================================
-
-TEST(hal_intrin, float32x4) {
-    TheTest<v_float32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-}
-
-#if CV_SIMD128_64F
-TEST(hal_intrin, float64x2) {
-    TheTest<v_float64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-#endif
+TEST(hal_intrin, uint8x16)
+{ test_hal_intrin_uint8(); }
+
+TEST(hal_intrin, int8x16)
+{ test_hal_intrin_int8(); }
+
+TEST(hal_intrin, uint16x8)
+{ test_hal_intrin_uint16(); }
+
+TEST(hal_intrin, int16x8)
+{ test_hal_intrin_int16(); }
+
+TEST(hal_intrin, int32x4)
+{ test_hal_intrin_int32(); }
+
+TEST(hal_intrin, uint32x4)
+{ test_hal_intrin_uint32(); }
+
+TEST(hal_intrin, uint64x2)
+{ test_hal_intrin_uint64(); }
+
+TEST(hal_intrin, int64x2)
+{ test_hal_intrin_int64(); }
+
+TEST(hal_intrin, float32x4)
+{ test_hal_intrin_float32(); }
+
+TEST(hal_intrin, float64x2)
+{ test_hal_intrin_float64(); }

-TEST(hal_intrin,float16)
+TEST(hal_intrin, float16x8)
{
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
}

-}}
+#define DISPATCH_SIMD_MODES AVX2
+#define DISPATCH_SIMD_NAME "SIMD256"
+#define DISPATCH_SIMD(fun) \
+    do { \
+        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES); \
+        throw SkipTestException( \
+            "Unsupported hardware: " \
+            DISPATCH_SIMD_NAME \
+            " is not available" \
+        ); \
+    } while(0)
+
+TEST(hal_intrin256, uint8x32)
+{ DISPATCH_SIMD(test_hal_intrin_uint8); }
+
+TEST(hal_intrin256, int8x32)
+{ DISPATCH_SIMD(test_hal_intrin_int8); }
+
+TEST(hal_intrin256, uint16x16)
+{ DISPATCH_SIMD(test_hal_intrin_uint16); }
+
+TEST(hal_intrin256, int16x16)
+{ DISPATCH_SIMD(test_hal_intrin_int16); }
+
+TEST(hal_intrin256, uint32x8)
+{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+
+TEST(hal_intrin256, int32x8)
+{ DISPATCH_SIMD(test_hal_intrin_int32); }
+
+TEST(hal_intrin256, uint64x4)
+{ DISPATCH_SIMD(test_hal_intrin_uint64); }
+
+TEST(hal_intrin256, int64x4)
+{ DISPATCH_SIMD(test_hal_intrin_int64); }
+
+TEST(hal_intrin256, float32x8)
+{ DISPATCH_SIMD(test_hal_intrin_float32); }
+
+TEST(hal_intrin256, float64x4)
+{ DISPATCH_SIMD(test_hal_intrin_float64); }
+
+TEST(hal_intrin256, float16x16)
+{
+    if (!CV_CPU_HAS_SUPPORT_FP16)
+        throw SkipTestException("Unsupported hardware: FP16 is not available");
+    DISPATCH_SIMD(test_hal_intrin_float16);
+}
+
+}} // namespace

@@ -9,7 +9,7 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void test_hal_intrin_float16()
{
-   TheTest<v_float16x8>()
+   TheTest<v_float16>()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        ;

@@ -0,0 +1,296 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void test_hal_intrin_uint8();
void test_hal_intrin_int8();
void test_hal_intrin_uint16();
void test_hal_intrin_int16();
void test_hal_intrin_uint32();
void test_hal_intrin_int32();
void test_hal_intrin_uint64();
void test_hal_intrin_int64();
void test_hal_intrin_float32();
void test_hal_intrin_float64();
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
//============= 8-bit integer =====================================================================
void test_hal_intrin_uint8()
{
TheTest<v_uint8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
#if CV_SIMD256
TheTest<v_uint8>()
.test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
.test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
.test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
.test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
;
#endif
}
void test_hal_intrin_int8()
{
TheTest<v_int8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
//============= 16-bit integer =====================================================================
void test_hal_intrin_uint16()
{
TheTest<v_uint16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
void test_hal_intrin_int16()
{
TheTest<v_int16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_dot_prod()
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
//============= 32-bit integer =====================================================================
void test_hal_intrin_uint32()
{
TheTest<v_uint32>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
;
}
void test_hal_intrin_int32()
{
TheTest<v_int32>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_abs()
.test_cmp()
.test_popcount()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
;
}
//============= 64-bit integer =====================================================================
void test_hal_intrin_uint64()
{
TheTest<v_uint64>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
void test_hal_intrin_int64()
{
TheTest<v_int64>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
//============= Floating point =====================================================================
void test_hal_intrin_float32()
{
TheTest<v_float32>()
.test_loadstore()
.test_interleave()
.test_interleave_2channel()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_reduce()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt64()
.test_matmul()
.test_transpose()
.test_reduce_sum4()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
;
#if CV_SIMD256
TheTest<v_float32>()
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
;
#endif
}
void test_hal_intrin_float64()
{
#if CV_SIMD_64F
TheTest<v_float64>()
.test_loadstore()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt32()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
#if CV_SIMD256
TheTest<v_float64>()
.test_extract<2>().test_extract<3>()
.test_rotate<2>().test_rotate<3>()
;
#endif //CV_SIMD256
#endif
}
#if CV_FP16 && CV_SIMD_WIDTH > 16
void test_hal_intrin_float16()
{
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
}
#endif
#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} //namespace

@@ -13,6 +13,27 @@ void test_hal_intrin_float16();

template <typename R> struct Data;
template <int N> struct initializer;
template <> struct initializer<64>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
                 d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55],
                 d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]);
}
};
template <> struct initializer<32>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]);
}
};
template <> struct initializer<16>
{
    template <typename R> static R init(const Data<R> & d)
@@ -125,6 +146,17 @@ template <typename R> struct Data
    {
        return d + R::nlanes / 2;
    }
LaneType sum(int s, int c)
{
LaneType res = 0;
for (int i = s; i < s + c; ++i)
res += d[i];
return res;
}
LaneType sum()
{
return sum(0, R::nlanes);
}
    bool operator==(const Data<R> & other) const
    {
        for (int i = 0; i < R::nlanes; ++i)
@@ -147,13 +179,12 @@ template <typename R> struct Data
            return false;
        return true;
    }

    LaneType d[R::nlanes];
};
template<typename R> struct AlignedData
{
-   Data<R> CV_DECL_ALIGNED(16) a; // aligned
+   Data<R> CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned
    char dummy;
    Data<R> u; // unaligned
};
@@ -207,22 +238,22 @@ template<typename R> struct TheTest
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
-       EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
-       EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+       EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
+       EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);

        // check some initialization methods
        R r1 = data.a;
-       R r2 = v_load(data.u.d);
-       R r3 = v_load_aligned(data.a.d);
+       R r2 = vx_load(data.u.d);
+       R r3 = vx_load_aligned(data.a.d);
        R r4(r2);
        EXPECT_EQ(data.a[0], r1.get0());
        EXPECT_EQ(data.u[0], r2.get0());
        EXPECT_EQ(data.a[0], r3.get0());
        EXPECT_EQ(data.u[0], r4.get0());

-       R r_low = v_load_low((LaneType*)data.u.d);
+       R r_low = vx_load_low((LaneType*)data.u.d);
        EXPECT_EQ(data.u[0], r_low.get0());
        v_store(out.u.d, r_low);
        for (int i = 0; i < R::nlanes/2; ++i)
@@ -230,7 +261,7 @@ template<typename R> struct TheTest
            EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
        }

-       R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
+       R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2)));
        EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
        v_store(out.u.d, r_low_align8byte);
        for (int i = 0; i < R::nlanes/2; ++i)
@@ -255,7 +286,7 @@ template<typename R> struct TheTest

        // check halves load correctness
        res.clear();
-       R r6 = v_load_halves(d.d, d.mid());
+       R r6 = vx_load_halves(d.d, d.mid());
        v_store(res.d, r6);
        EXPECT_EQ(d, res);
@@ -270,17 +301,17 @@ template<typename R> struct TheTest
        }

        // reinterpret_as
-       v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
-       v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
-       v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
-       v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
-       v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
-       v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
-       v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
-       v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
-       v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
-#if CV_SIMD128_64F
-       v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
+       v_uint8 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
+       v_int8 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
+       v_uint16 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
+       v_int16 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
+       v_uint32 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
+       v_int32 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
+       v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
+       v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
+       v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
+#if CV_SIMD_64F
+       v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif

        return *this;
@@ -357,7 +388,7 @@ template<typename R> struct TheTest
        Data<R> dataA;
        R a = dataA;

-       Data<Rx2> resB = v_load_expand(dataA.d);
+       Data<Rx2> resB = vx_load_expand(dataA.d);

        Rx2 c, d;
        v_expand(a, c, d);
@@ -378,7 +409,7 @@ template<typename R> struct TheTest
    {
        typedef typename V_RegTraits<R>::q_reg Rx4;
        Data<R> data;
-       Data<Rx4> out = v_load_expand_q(data.d);
+       Data<Rx4> out = vx_load_expand_q(data.d);
        const int n = Rx4::nlanes;
        for (int i = 0; i < n; ++i)
            EXPECT_EQ(data[i], out[i]);
@@ -610,7 +641,13 @@ template<typename R> struct TheTest
    TheTest & test_popcount()
    {
-       static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33};
+       static unsigned popcountTable[] = {
+           0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
+           35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
+           83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
+           128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
+           176, 181, 186, 192, 193
+       };
        Data<R> dataA;
        R a = dataA;
@@ -918,7 +955,7 @@ template<typename R> struct TheTest
    TheTest & test_float_cvt32()
    {
-       typedef v_float32x4 Rt;
+       typedef v_float32 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
@@ -934,8 +971,8 @@ template<typename R> struct TheTest
    TheTest & test_float_cvt64()
    {
-#if CV_SIMD128_64F
-       typedef v_float64x2 Rt;
+#if CV_SIMD_64F
+       typedef v_float64 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
@@ -965,23 +1002,29 @@ template<typename R> struct TheTest
        R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;

        Data<R> res = v_matmul(v, a, b, c, d);
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           LaneType val = dataV[0] * dataA[i]
-                        + dataV[1] * dataB[i]
-                        + dataV[2] * dataC[i]
-                        + dataV[3] * dataD[i];
-           EXPECT_DOUBLE_EQ(val, res[i]);
+           for (int j = i; j < i + 4; ++j)
+           {
+               LaneType val = dataV[i] * dataA[j]
+                            + dataV[i + 1] * dataB[j]
+                            + dataV[i + 2] * dataC[j]
+                            + dataV[i + 3] * dataD[j];
+               EXPECT_COMPARE_EQ(val, res[j]);
+           }
        }

        Data<R> resAdd = v_matmuladd(v, a, b, c, d);
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           LaneType val = dataV[0] * dataA[i]
-                        + dataV[1] * dataB[i]
-                        + dataV[2] * dataC[i]
-                        + dataD[i];
-           EXPECT_DOUBLE_EQ(val, resAdd[i]);
+           for (int j = i; j < i + 4; ++j)
+           {
+               LaneType val = dataV[i] * dataA[j]
+                            + dataV[i + 1] * dataB[j]
+                            + dataV[i + 2] * dataC[j]
+                            + dataD[j];
+               EXPECT_COMPARE_EQ(val, resAdd[j]);
+           }
        }
        return *this;
    }
@@ -998,30 +1041,36 @@ template<typename R> struct TheTest
                    e, f, g, h);

        Data<R> res[4] = {e, f, g, h};
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           EXPECT_EQ(dataA[i], res[i][0]);
-           EXPECT_EQ(dataB[i], res[i][1]);
-           EXPECT_EQ(dataC[i], res[i][2]);
-           EXPECT_EQ(dataD[i], res[i][3]);
+           for (int j = 0; j < 4; ++j)
+           {
+               EXPECT_EQ(dataA[i + j], res[j][i]);
+               EXPECT_EQ(dataB[i + j], res[j][i + 1]);
+               EXPECT_EQ(dataC[i + j], res[j][i + 2]);
+               EXPECT_EQ(dataD[i + j], res[j][i + 3]);
+           }
        }
        return *this;
    }

    TheTest & test_reduce_sum4()
    {
-       R a(0.1f, 0.02f, 0.003f, 0.0004f);
-       R b(1, 20, 300, 4000);
-       R c(10, 2, 0.3f, 0.04f);
-       R d(1, 2, 3, 4);
-
-       R sum = v_reduce_sum4(a, b, c, d);
-
-       Data<R> res = sum;
-       EXPECT_EQ(0.1234f, res[0]);
-       EXPECT_EQ(4321.0f, res[1]);
-       EXPECT_EQ(12.34f, res[2]);
-       EXPECT_EQ(10.0f, res[3]);
+       Data<R> dataA, dataB, dataC, dataD;
+       dataB *= 0.01f;
+       dataC *= 0.001f;
+       dataD *= 0.002f;
+       R a = dataA, b = dataB, c = dataC, d = dataD;
+       Data<R> res = v_reduce_sum4(a, b, c, d);
+
+       for (int i = 0; i < R::nlanes; i += 4)
+       {
+           EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
+           EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
+           EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
+           EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]);
+       }
        return *this;
    }
@@ -1032,14 +1081,14 @@ template<typename R> struct TheTest
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
-       EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
-       EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+       EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
+       EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);

        // check some initialization methods
        R r1 = data.u;
-       R r2 = v_load_f16(data.a.d);
+       R r2 = vx_load_f16(data.a.d);
        R r3(r2);
        EXPECT_EQ(data.u[0], r1.get0());
        EXPECT_EQ(data.a[0], r2.get0());

@@ -173,7 +173,6 @@ void Core_RandTest::run( int )
            dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz;
            Mat aslice = arr[k].colRange(sz, sz + dsz);
            tested_rng.fill(aslice, dist_type, A, B);
-           //printf("%d - %d\n", sz, sz + dsz);
        }
    }

@@ -85,12 +85,6 @@ else()
    set(sources_options EXCLUDE_OPENCL)
endif()

-if(HAVE_INF_ENGINE)
-    add_definitions(-DHAVE_INF_ENGINE=1)
-    list(APPEND include_dirs ${INF_ENGINE_INCLUDE_DIRS})
-    list(APPEND libs ${INF_ENGINE_LIBRARIES})
-endif()
-
ocv_module_include_directories(${include_dirs})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override") # GCC
@@ -98,9 +92,9 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override") # Clang
endif()
ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
-ocv_create_module(${libs})
+ocv_create_module(${libs} ${INF_ENGINE_TARGET})
ocv_add_samples()
-ocv_add_accuracy_tests()
+ocv_add_accuracy_tests(${INF_ENGINE_TARGET})
ocv_add_perf_tests()

ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
@@ -120,9 +114,3 @@ if(BUILD_PERF_TESTS)
    endif()
  endif()
endif()
-
-# Test Intel's Inference Engine models
-if(HAVE_INF_ENGINE AND TARGET opencv_test_dnn)
-  ocv_target_include_directories(opencv_test_dnn PRIVATE ${INF_ENGINE_INCLUDE_DIRS})
-  ocv_target_link_libraries(opencv_test_dnn LINK_PRIVATE ${INF_ENGINE_LIBRARIES})
-endif()

@@ -201,7 +201,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
     * @param[out] outputs allocated output blobs, which will store results of the computation.
     * @param[out] internals allocated internal blobs
     */
-   virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) = 0;
+   virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);

    /** @brief Given the @p input blobs, computes the output @p blobs.
     * @param[in] inputs the input blobs.

@@ -44,7 +44,9 @@
#include <opencv2/core.hpp>
#include <opencv2/core/types_c.h>
+#include <iostream>
#include <ostream>
+#include <sstream>

namespace cv {
namespace dnn {
@@ -178,13 +180,25 @@ static inline MatShape concat(const MatShape& a, const MatShape& b)
    return c;
}

-inline void print(const MatShape& shape, const String& name = "")
-{
-    printf("%s: [", name.c_str());
-    size_t i, n = shape.size();
-    for( i = 0; i < n; i++ )
-        printf(" %d", shape[i]);
-    printf(" ]\n");
-}
+static inline std::string toString(const MatShape& shape, const String& name = "")
+{
+    std::ostringstream ss;
+    if (!name.empty())
+        ss << name << ' ';
+    ss << '[';
+    for(size_t i = 0, n = shape.size(); i < n; ++i)
+        ss << ' ' << shape[i];
+    ss << " ]";
+    return ss.str();
+}
+static inline void print(const MatShape& shape, const String& name = "")
+{
+    std::cout << toString(shape, name) << std::endl;
+}
+static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape)
+{
+    out << toString(shape);
+    return out;
+}

inline int clamp(int ax, int dims)
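A minimal usage sketch for the helpers added above (an illustration, not part of the patch; MatShape is the std::vector<int> alias used throughout the module):

#include <iostream>
#include <opencv2/dnn/shape_utils.hpp>

int main()
{
    using cv::dnn::operator<<;               // MatShape is std::vector<int>, so ADL
                                             // does not find the operator on its own
    cv::dnn::MatShape shape;
    shape.push_back(1); shape.push_back(3);
    shape.push_back(224); shape.push_back(224);
    std::cout << cv::dnn::toString(shape, "input") << std::endl; // "input [ 1 3 224 224 ]"
    std::cout << shape << std::endl;                             // "[ 1 3 224 224 ]"
    return 0;
}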

@@ -74,6 +74,10 @@ static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT(
#endif
);

+// Additional checks (slows down execution!)
+static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
+static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
+static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
@@ -2053,10 +2057,75 @@ struct Net::Impl
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
        {
+           std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
            std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
+           std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
-           layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
+           layer->forward(umat_inputBlobs,
                           umat_outputBlobs,
-                          OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers));
+                          umat_internalBlobs);
if (DNN_CHECK_NAN_INF)
{
bool fail = false;
for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
if (!checkRange(m))
{
std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
else if (!checkRange(m, true, NULL, -1e6, 1e6))
{
std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
}
if (fail)
{
for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
{
UMat& u = umat_inputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
{
UMat& u = umat_internalBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
}
if (DNN_CHECK_NAN_INF_RAISE_ERROR)
CV_Assert(!fail);
}
}
            OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
        }
        else
@@ -2069,6 +2138,56 @@ struct Net::Impl
            layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
if (DNN_CHECK_NAN_INF)
{
bool fail = false;
for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
{
const Mat& m = ld.outputBlobs[i];
if (!checkRange(m))
{
std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
else if (!checkRange(m, true, NULL, -1e6, 1e6))
{
std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
}
if (fail)
{
for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
{
const Mat* pM = ld.inputBlobs[i];
if (!pM)
{
std::cout << "INPUT " << i << " is NULL" << std::endl;
continue;
}
const Mat& m = *pM;
std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
{
const Mat& m = ld.outputBlobs[i];
std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < ld.internals.size(); ++i)
{
const Mat& m = ld.internals[i];
std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
if (DNN_CHECK_NAN_INF_RAISE_ERROR)
CV_Assert(!fail);
}
}
            for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
            {
                if (!ld.outputBlobsWrappers[i].empty())
@@ -3071,6 +3190,14 @@ std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
    return outputs;
}
void Layer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
Layer::forward_fallback(inputs, outputs, internals);
}
void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
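The checks added above are driven by the OPENCV_DNN_CHECK_NAN_INF* variables declared earlier in dnn.cpp; since they are read by static initializers, they must already be set in the process environment when the library loads, e.g. `OPENCV_DNN_CHECK_NAN_INF=1 OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR=1 ./app`. A hypothetical driver (model file names invented for illustration):

#include <opencv2/dnn.hpp>

int main()
{
    // With the flags exported, every layer output computed below is scanned
    // for NaN/Inf, offending layers are reported, and (with RAISE_ERROR) the
    // forward pass fails via CV_Assert instead of silently propagating them.
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("model.prototxt", "model.caffemodel");
    cv::Mat blob = cv::dnn::blobFromImage(cv::Mat::zeros(224, 224, CV_8UC3));
    net.setInput(blob);
    cv::Mat out = net.forward();
    (void)out;
    return 0;
}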

@ -196,7 +196,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE virtual bool supportBackend(int backendId) CV_OVERRIDE
{ {
return backendId == DNN_BACKEND_OPENCV || return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !_locPredTransposed; backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized;
} }
bool getMemoryShapes(const std::vector<MatShape> &inputs, bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -411,9 +411,12 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (_bboxesNormalized)
{
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
           OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
           forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
@@ -916,6 +919,7 @@ public:
ieLayer->params["nms_threshold"] = format("%f", _nmsThreshold);
ieLayer->params["top_k"] = format("%d", _topK);
ieLayer->params["keep_top_k"] = format("%d", _keepTopK);
ieLayer->params["eta"] = "1.0";
ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold); ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold);
ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0"; ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";
ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType; ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType;

@@ -135,10 +135,17 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
{
if (preferableTarget == DNN_TARGET_MYRIAD)
return type == MAX || type == AVE;
else
return type != STOCHASTIC;
}
else
return backendId == DNN_BACKEND_OPENCV ||
       backendId == DNN_BACKEND_HALIDE && haveHalide() &&
-      (type == MAX || type == AVE && !pad.width && !pad.height) ||
-      backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (type == MAX || type == AVE);
+      (type == MAX || type == AVE && !pad.width && !pad.height);
}
#ifdef HAVE_OPENCL
@@ -192,8 +199,11 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (type == MAX || type == AVE || type == STOCHASTIC)
{
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
           forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
@@ -238,22 +248,41 @@ public:
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
-lp.type = "Pooling";
lp.precision = InferenceEngine::Precision::FP32;
-std::shared_ptr<InferenceEngine::PoolingLayer> ieLayer(new InferenceEngine::PoolingLayer(lp));
-ieLayer->_kernel_x = kernel.width;
-ieLayer->_kernel_y = kernel.height;
-ieLayer->_stride_x = stride.width;
-ieLayer->_stride_y = stride.height;
-ieLayer->_padding_x = pad.width;
-ieLayer->_padding_y = pad.height;
-ieLayer->_exclude_pad = type == AVE && padMode == "SAME";
-ieLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
-if (type == MAX)
-    ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::MAX;
-else if (type == AVE)
-    ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::AVG;
+std::shared_ptr<InferenceEngine::CNNLayer> ieLayer;
+if (type == MAX || type == AVE)
+{
+    lp.type = "Pooling";
+    InferenceEngine::PoolingLayer* poolLayer = new InferenceEngine::PoolingLayer(lp);
+    poolLayer->_kernel_x = kernel.width;
+    poolLayer->_kernel_y = kernel.height;
+    poolLayer->_stride_x = stride.width;
+    poolLayer->_stride_y = stride.height;
+    poolLayer->_padding_x = pad.width;
+    poolLayer->_padding_y = pad.height;
+    poolLayer->_exclude_pad = type == AVE && padMode == "SAME";
+    poolLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
+    poolLayer->_type = type == MAX ? InferenceEngine::PoolingLayer::PoolType::MAX :
+                                     InferenceEngine::PoolingLayer::PoolType::AVG;
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(poolLayer);
+}
+else if (type == ROI)
+{
+    lp.type = "ROIPooling";
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+    ieLayer->params["pooled_w"] = format("%d", pooledSize.width);
+    ieLayer->params["pooled_h"] = format("%d", pooledSize.height);
+    ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+}
+else if (type == PSROI)
+{
+    lp.type = "PSROIPooling";
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+    ieLayer->params["output_dim"] = format("%d", psRoiOutChannels);
+    ieLayer->params["group_size"] = format("%d", pooledSize.width);
+    ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+}
else
    CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
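The "rounding-type" parameter forwarded above decides how fractional output extents are resolved. A sketch of the standard pooling arithmetic it corresponds to (assuming symmetric padding; not code from the patch):

#include <cmath>

// For in=7, kernel=3, pad=0, stride=2: floor gives (7-3)/2+1 = 3, ceil gives 4.
static int pooledExtent(int in, int kernel, int pad, int stride, bool ceilMode)
{
    double a = double(in + 2 * pad - kernel) / stride;
    return int(ceilMode ? std::ceil(a) : std::floor(a)) + 1;
}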

@@ -6,6 +6,7 @@
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
namespace cv { namespace dnn {
@@ -16,14 +17,14 @@ public:
{
setParamsFrom(params);
-uint32_t featStride = params.get<uint32_t>("feat_stride", 16);
+featStride = params.get<uint32_t>("feat_stride", 16);
-uint32_t baseSize = params.get<uint32_t>("base_size", 16);
+baseSize = params.get<uint32_t>("base_size", 16);
// uint32_t minSize = params.get<uint32_t>("min_size", 16);
-uint32_t keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
+keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
keepTopAfterNMS = params.get<uint32_t>("post_nms_topn", 300);
-float nmsThreshold = params.get<float>("nms_thresh", 0.7);
+nmsThreshold = params.get<float>("nms_thresh", 0.7);
-DictValue ratios = params.get("ratio");
+ratios = params.get("ratio");
-DictValue scales = params.get("scale");
+scales = params.get("scale");
{
LayerParams lp;
@@ -83,6 +84,12 @@ public:
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && preferableTarget != DNN_TARGET_MYRIAD;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
                     const int requiredOutputs,
                     std::vector<MatShape> &outputs,
@@ -312,6 +319,38 @@ public:
outputs[i].rowRange(numDets, keepTopAfterNMS).setTo(0);
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Proposal";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
ieLayer->params["base_size"] = format("%d", baseSize);
ieLayer->params["feat_stride"] = format("%d", featStride);
ieLayer->params["min_size"] = "16";
ieLayer->params["nms_thresh"] = format("%f", nmsThreshold);
ieLayer->params["post_nms_topn"] = format("%d", keepTopAfterNMS);
ieLayer->params["pre_nms_topn"] = format("%d", keepTopBeforeNMS);
if (ratios.size())
{
ieLayer->params["ratio"] = format("%f", ratios.get<float>(0));
for (int i = 1; i < ratios.size(); ++i)
ieLayer->params["ratio"] += format(",%f", ratios.get<float>(i));
}
if (scales.size())
{
ieLayer->params["scale"] = format("%f", scales.get<float>(0));
for (int i = 1; i < scales.size(); ++i)
ieLayer->params["scale"] += format(",%f", scales.get<float>(i));
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
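For illustration (hypothetical values): with ratios = {0.5, 1, 2} and scales = {8, 16, 32}, the two loops above serialize

ieLayer->params["ratio"] = "0.500000,1.000000,2.000000";
ieLayer->params["scale"] = "8.000000,16.000000,32.000000";

i.e. format("%f") renders each value with six decimals and the entries are joined with commas, the list form the Inference Engine layer parameters use.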
private:
// The first half of the channels are background scores; we need only the second half.
static Mat getObjectScores(const Mat& m)
@@ -342,8 +381,10 @@ private:
Ptr<PermuteLayer> deltasPermute;
Ptr<PermuteLayer> scoresPermute;
-uint32_t keepTopAfterNMS;
+uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize;
Mat fakeImageBlob;
float nmsThreshold;
DictValue ratios, scales;
#ifdef HAVE_OPENCL
UMat umat_fakeImageBlob;
#endif

@@ -183,8 +183,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
ocl::Kernel oclk_sto_pool_forward(
    kname.c_str(),
    ocl::dnn::ocl4dnn_pooling_oclsrc,
-   format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+   format(" -D Dtype=%s -D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
           " -D STRIDE_W=%d -D STRIDE_H=%d",
+          (use_half) ? "half" : "float",
           kernel_w_, kernel_h_,
           stride_w_, stride_h_
    ));
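For reference, with hypothetical values use_half == true, a 3x3 kernel and 2x2 stride, the format() call above now produces the build options

 -D Dtype=half -D KERNEL_STO_POOL=1 -D KERNEL_W=3 -D KERNEL_H=3 -D STRIDE_W=2 -D STRIDE_H=2

so the same ocl4dnn_pooling.cl source can be compiled for either half or float, matching the Dtype changes to the kernels further below.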

@@ -322,12 +322,32 @@ InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noexcept
return InferenceEngine::StatusCode::OK;
}
InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(size_t size, InferenceEngine::ResponseDesc *responseDesc) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
size_t InfEngineBackendNet::getBatchSize() const noexcept
{
CV_Error(Error::StsNotImplemented, "");
return 0;
}
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
InferenceEngine::StatusCode InfEngineBackendNet::AddExtension(const InferenceEngine::IShapeInferExtensionPtr &extension, InferenceEngine::ResponseDesc *resp) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
InferenceEngine::StatusCode InfEngineBackendNet::reshape(const InferenceEngine::ICNNNetwork::InputShapes &inputShapes, InferenceEngine::ResponseDesc *resp) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
#endif
void InfEngineBackendNet::init(int targetId)
{
if (inputs.empty())

@@ -9,6 +9,8 @@
#define __OPENCV_DNN_OP_INF_ENGINE_HPP__
#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/dnn.hpp"
#ifdef HAVE_INF_ENGINE
#if defined(__GNUC__) && __GNUC__ >= 5
@@ -19,6 +21,17 @@
#if defined(__GNUC__) && __GNUC__ >= 5
//#pragma GCC diagnostic pop
#endif
#define INF_ENGINE_RELEASE_2018R1 2018010000
#define INF_ENGINE_RELEASE_2018R2 2018020000
#ifndef INF_ENGINE_RELEASE
#warning("IE version have not been provided via command-line. Using 2018R2 by default")
#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R2
#endif
#define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
#endif // HAVE_INF_ENGINE
namespace cv { namespace dnn {
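INF_ENGINE_RELEASE packs the version as four year digits, two release digits and four patch digits, so dividing by 10000 keeps the year+release part and drops the patch level; INF_ENGINE_VER_MAJOR_GT is therefore a release-level comparison. A worked example of the arithmetic, assuming a hypothetical 2018R3 value of 2018030000 (not defined by this patch):

// Illustrative only; INF_ENGINE_RELEASE_2018R3 is an assumed value.
#define INF_ENGINE_RELEASE_2018R3 2018030000
// With INF_ENGINE_RELEASE == 2018030000:
//   INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
//     == ((2018030000 / 10000) > (2018020000 / 10000))
//     == (201803 > 201802) == true
// so the AddExtension/reshape overrides guarded below compile only past 2018R2.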
@@ -86,8 +99,15 @@ public:
virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;
virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) noexcept;
virtual size_t getBatchSize() const noexcept CV_OVERRIDE;
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) noexcept;
virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) noexcept;
#endif
void init(int targetId);
void addBlobs(const std::vector<Ptr<BackendWrapper> >& wrappers);

@@ -104,7 +104,7 @@ __kernel void
#elif defined KERNEL_AVE_POOL
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
-   const int nthreads, __global const Dtype* const bottom_data,
+   const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)
@@ -150,7 +150,7 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
#elif defined KERNEL_STO_POOL
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
-   const int nthreads, __global const Dtype* const bottom_data,
+   const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)

@@ -1293,7 +1293,13 @@ void TFImporter::populateNet(Net dstNet)
if (!next_layers.empty())
{
    int maximumLayerIdx = next_layers[0].second;
-   ExcludeLayer(net, maximumLayerIdx, 0, false);
+   CV_Assert(net.node(maximumLayerIdx).input_size() == 2);
+   // The input from the Mul layer can also be at index 1.
+   int mulInputIdx = (net.node(maximumLayerIdx).input(0) == name) ? 0 : 1;
+   ExcludeLayer(net, maximumLayerIdx, mulInputIdx, false);
    layers_to_ignore.insert(next_layers[0].first);
    layerParams.set("negative_slope", scaleMat.at<float>(0));

@@ -938,6 +938,16 @@ struct TorchImporter
layerParams.set("end", DictValue::arrayInt<int*>(&ends[0], 4));
curModule->modules.push_back(newModule);
}
else if (nnName == "SpatialUpSamplingNearest")
{
readTorchTable(scalarParams, tensorParams);
CV_Assert(scalarParams.has("scale_factor"));
int scale_factor = scalarParams.get<int>("scale_factor");
newModule->apiType = "Resize";
layerParams.set("interpolation", "nearest");
layerParams.set("zoom_factor", scale_factor);
curModule->modules.push_back(newModule);
}
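With this branch, Torch's SpatialUpSamplingNearest maps onto the existing dnn Resize layer instead of requiring a custom layer. A sketch of the LayerParams the importer effectively builds for scale_factor = 2 (the layer name here is hypothetical):

cv::dnn::LayerParams lp;
lp.type = "Resize";                  // newModule->apiType above
lp.name = "upsample_nearest";        // illustrative name
lp.set("interpolation", "nearest");
lp.set("zoom_factor", 2);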
else
{
// Importer does not know how to map Torch's layer type to an OpenCV one.

@@ -175,7 +175,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 0.0;
-float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
+float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0;
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",
           inp, "detection_out", "", l1, lInf, 0.25);
}
@@ -233,11 +233,8 @@ TEST_P(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE)
    throw SkipTestException("");
-Size inpSize;
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-    inpSize = Size(300, 300);
Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
-Mat inp = blobFromImage(img, 1.0, inpSize, Scalar(104.0, 177.0, 123.0), false, false);
+Mat inp = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt",
           inp, "detection_out");
}
@@ -249,7 +246,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.008 : 0.0;
-float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.07 : 0.0;
+float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0731 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
           inp, "detection_out", "", l1, lInf);
}

@@ -51,6 +51,33 @@ static std::string _tf(TString filename)
return (getOpenCVExtraDir() + "/dnn/") + filename;
}
class Test_Caffe_nets : public DNNTestLayer
{
public:
void testFaster(const std::string& proto, const std::string& model, const Mat& ref,
double scoreDiff = 0.0, double iouDiff = 0.0)
{
checkBackend();
Net net = readNetFromCaffe(findDataFile("dnn/" + proto, false),
findDataFile("dnn/" + model, false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
Mat img = imread(findDataFile("dnn/dog416.png", false));
resize(img, img, Size(800, 600));
Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
net.setInput(blob, "data");
net.setInput(imInfo, "im_info");
// Output has shape 1x1xNx7, where N is the number of detections.
// Every detection is a vector of values [id, classId, confidence, left, top, right, bottom].
Mat out = net.forward();
scoreDiff = scoreDiff ? scoreDiff : default_l1;
iouDiff = iouDiff ? iouDiff : default_lInf;
normAssertDetections(ref, out, ("model name: " + model).c_str(), 0.8, scoreDiff, iouDiff);
}
};
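Given the layout described in the comment above, the 1x1xNx7 blob can be walked row by row once reshaped to N x 7. An illustrative post-processing loop (not part of the test; the 0.8 threshold matches the score used by normAssertDetections here):

cv::Mat detections = out.reshape(1, (int)out.total() / 7);  // N x 7 rows
for (int i = 0; i < detections.rows; ++i)
{
    const float* d = detections.ptr<float>(i);
    if (d[2] > 0.8f)  // d = [id, classId, confidence, left, top, right, bottom]
        std::cout << "class " << int(d[1]) << " conf " << d[2] << " box ["
                  << d[3] << ", " << d[4] << ", " << d[5] << ", " << d[6] << "]" << std::endl;
}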
TEST(Test_Caffe, memory_read)
{
const string proto = findDataFile("dnn/bvlc_googlenet.prototxt", false);
@@ -344,9 +371,15 @@ TEST(Reproducibility_GoogLeNet_fp16, Accuracy)
}
// https://github.com/richzhang/colorization
-TEST(Reproducibility_Colorization, Accuracy)
+TEST_P(Test_Caffe_nets, Colorization)
{
-const float l1 = 3e-5;
+checkBackend();
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+const float l1 = 4e-4;
const float lInf = 3e-3;
Mat inp = blobFromNPY(_tf("colorization_inp.npy"));
@@ -356,7 +389,8 @@ TEST(Reproducibility_Colorization, Accuracy)
const string proto = findDataFile("dnn/colorization_deploy_v2.prototxt", false);
const string model = findDataFile("dnn/colorization_release_v2.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
-net.setPreferableBackend(DNN_BACKEND_OPENCV);
+net.setPreferableBackend(backend);
+net.setPreferableTarget(target);
net.getLayer(net.getLayerId("class8_ab"))->blobs.push_back(kernel);
net.getLayer(net.getLayerId("conv8_313_rh"))->blobs.push_back(Mat(1, 313, CV_32F, 2.606));
@@ -447,39 +481,40 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
)
);
-TEST(Test_Caffe, FasterRCNN_and_RFCN)
-{
-std::string models[] = {"VGG16_faster_rcnn_final.caffemodel", "ZF_faster_rcnn_final.caffemodel",
-                        "resnet50_rfcn_final.caffemodel"};
-std::string protos[] = {"faster_rcnn_vgg16.prototxt", "faster_rcnn_zf.prototxt",
-                        "rfcn_pascal_voc_resnet50.prototxt"};
-Mat refs[] = {(Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
-                                    0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
-                                    0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166),
-              (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
-                                    0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
-                                    0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176),
-              (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
-                                    0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16)};
-for (int i = 0; i < 3; ++i)
-{
-    std::string proto = findDataFile("dnn/" + protos[i], false);
-    std::string model = findDataFile("dnn/" + models[i], false);
-    Net net = readNetFromCaffe(proto, model);
-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
-    Mat img = imread(findDataFile("dnn/dog416.png", false));
-    resize(img, img, Size(800, 600));
-    Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
-    Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
-    net.setInput(blob, "data");
-    net.setInput(imInfo, "im_info");
-    // Output has shape 1x1xNx7 where N - number of detections.
-    // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
-    Mat out = net.forward();
-    normAssertDetections(refs[i], out, ("model name: " + models[i]).c_str(), 0.8);
-}
-}
+TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
+                                       0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
+                                       0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
+testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref);
+}
+
+TEST_P(Test_Caffe_nets, FasterRCNN_zf)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
+                                       0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
+                                       0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
+testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref);
+}
+
+TEST_P(Test_Caffe_nets, RFCN)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
+                                       0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
+testFaster("rfcn_pascal_voc_resnet50.prototxt", "resnet50_rfcn_final.caffemodel", ref);
+}
INSTANTIATE_TEST_CASE_P(/**/, Test_Caffe_nets, dnnBackendsAndTargets());
}} // namespace

@@ -16,7 +16,7 @@ using namespace cv;
using namespace cv::dnn;
using namespace testing;
-static void test(Mat& input, Net& net, int backendId, int targetId)
+static void test(Mat& input, Net& net, Backend backendId, Target targetId, bool skipCheck = false)
{
DNNTestLayer::checkBackend(backendId, targetId);
randu(input, -1.0f, 1.0f);
@@ -29,16 +29,19 @@ static void test(Mat& input, Net& net, int backendId, int targetId)
net.setPreferableTarget(targetId);
Mat outputHalide = net.forward().clone();
if (skipCheck)
return;
double l1, lInf;
DNNTestLayer::getDefaultThresholds(backendId, targetId, &l1, &lInf);
normAssert(outputDefault, outputHalide, "", l1, lInf);
}
-static void test(LayerParams& params, Mat& input, int backendId, int targetId)
+static void test(LayerParams& params, Mat& input, Backend backendId, Target targetId, bool skipCheck = false)
{
Net net;
net.addLayerToPrev(params.name, params.type, params);
-test(input, net, backendId, targetId);
+test(input, net, backendId, targetId, skipCheck);
}
static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
@@ -101,16 +104,17 @@ TEST_P(Convolution, Accuracy)
Size pad = get<4>(GetParam());
Size dilation = get<5>(GetParam());
bool hasBias = get<6>(GetParam());
-int backendId = get<0>(get<7>(GetParam()));
-int targetId = get<1>(get<7>(GetParam()));
+Backend backendId = get<0>(get<7>(GetParam()));
+Target targetId = get<1>(get<7>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
bool skipCheck = false;
if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV &&
    (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
    kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1))
-   throw SkipTestException("Skip unstable test");
+   skipCheck = true;
int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
Mat weights(4, &sz[0], CV_32F);
@@ -139,7 +143,9 @@ TEST_P(Convolution, Accuracy)
}
int inpSz[] = {1, inChannels, inSize.height, inSize.width};
Mat input(4, &inpSz[0], CV_32F);
-test(lp, input, backendId, targetId);
+test(lp, input, backendId, targetId, skipCheck);
if (skipCheck)
throw SkipTestException("Skip checks in unstable test");
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
@@ -171,8 +177,8 @@ TEST_P(Deconvolution, Accuracy)
Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]);
Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]);
bool hasBias = get<6>(GetParam());
-int backendId = get<0>(get<7>(GetParam()));
-int targetId = get<1>(get<7>(GetParam()));
+Backend backendId = get<0>(get<7>(GetParam()));
+Target targetId = get<1>(get<7>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU &&
    dilation.width == 2 && dilation.height == 2)
    throw SkipTestException("");
@@ -235,8 +241,8 @@ TEST_P(LRN, Accuracy)
float bias = get<2>(GetParam())[2];
bool normBySize = get<3>(GetParam());
std::string nrmType = get<4>(GetParam());
-int backendId = get<0>(get<5>(GetParam()));
-int targetId = get<1>(get<5>(GetParam()));
+Backend backendId = get<0>(get<5>(GetParam()));
+Target targetId = get<1>(get<5>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    throw SkipTestException("");
@@ -276,8 +282,8 @@ TEST_P(AvePooling, Accuracy)
Size outSize = get<1>(GetParam()); // Input size will be computed from parameters.
Size kernel = get<2>(GetParam());
Size stride = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
@@ -317,8 +323,8 @@ TEST_P(MaxPooling, Accuracy)
Size kernel = get<2>(GetParam());
Size stride = get<3>(GetParam());
Size pad = get<4>(GetParam());
-int backendId = get<0>(get<5>(GetParam()));
-int targetId = get<1>(get<5>(GetParam()));
+Backend backendId = get<0>(get<5>(GetParam()));
+Target targetId = get<1>(get<5>(GetParam()));
LayerParams lp;
lp.set("pool", "max");
@@ -355,8 +361,8 @@ TEST_P(FullyConnected, Accuracy)
Size inSize = get<1>(GetParam());
int outChannels = get<2>(GetParam());
bool hasBias = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    throw SkipTestException("");
@@ -394,8 +400,8 @@ typedef TestWithParam<tuple<int, tuple<Backend, Target> > > SoftMax;
TEST_P(SoftMax, Accuracy)
{
int inChannels = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.type = "SoftMax";
lp.name = "testLayer";
@@ -457,7 +463,7 @@ TEST_P(Test_Halide_layers, MaxPoolUnpool)
////////////////////////////////////////////////////////////////////////////////
static const int kNumChannels = 3;
-void testInPlaceActivation(LayerParams& lp, int backendId, int targetId)
+void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId)
{
EXPECT_FALSE(lp.name.empty());
@@ -485,8 +491,8 @@ TEST_P(BatchNorm, Accuracy)
bool hasWeights = get<0>(GetParam());
bool hasBias = get<1>(GetParam());
float epsilon = get<2>(GetParam());
-int backendId = get<0>(get<3>(GetParam()));
-int targetId = get<1>(get<3>(GetParam()));
+Backend backendId = get<0>(get<3>(GetParam()));
+Target targetId = get<1>(get<3>(GetParam()));
LayerParams lp;
lp.set("has_weight", hasWeights);
@@ -518,8 +524,8 @@ typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
TEST_P(ReLU, Accuracy)
{
float negativeSlope = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("negative_slope", negativeSlope);
@@ -536,8 +542,8 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
TEST_P(NoParamActivation, Accuracy)
{
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.type = get<0>(GetParam());
@@ -555,8 +561,8 @@ TEST_P(Power, Accuracy)
float power = get<0>(GetParam())[0];
float scale = get<0>(GetParam())[1];
float shift = get<0>(GetParam())[2];
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("power", power);
@@ -589,8 +595,8 @@ typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
TEST_P(Scale, Accuracy)
{
bool hasBias = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("bias_term", hasBias);
@@ -624,8 +630,8 @@ TEST_P(Concat, Accuracy)
{
Vec3i inSize = get<0>(GetParam());
Vec3i numChannels = get<1>(GetParam());
-int backendId = get<0>(get<2>(GetParam()));
-int targetId = get<1>(get<2>(GetParam()));
+Backend backendId = get<0>(get<2>(GetParam()));
+Target targetId = get<1>(get<2>(GetParam()));
Net net;
@@ -692,8 +698,8 @@ TEST_P(Eltwise, Accuracy)
std::string op = get<1>(GetParam());
int numConv = get<2>(GetParam());
bool weighted = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
Net net;

@@ -1205,14 +1205,6 @@ public:
}
}
-void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE
-{
-    CV_TRACE_FUNCTION();
-    CV_TRACE_ARG_VALUE(name, "name", name.c_str());
-    Layer::forward_fallback(inputs, outputs, internals);
-}
private:
int outWidth, outHeight, zoomFactor;
};
@@ -1225,7 +1217,7 @@ TEST_P(Test_Caffe_layers, DISABLED_Interp) // requires patched protobuf (available in OpenCV source tree only)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
-// Test a cusom layer.
+// Test a custom layer.
CV_DNN_REGISTER_LAYER_CLASS(Interp, CustomInterpLayer);
try
{

@@ -230,6 +230,13 @@ TEST_P(Test_TensorFlow_layers, flatten)
runTensorFlowNet("unfused_flatten_unknown_batch");
}
TEST_P(Test_TensorFlow_layers, leaky_relu)
{
runTensorFlowNet("leaky_relu_order1");
runTensorFlowNet("leaky_relu_order2");
runTensorFlowNet("leaky_relu_order3");
}
TEST_P(Test_TensorFlow_layers, l2_normalize)
{
runTensorFlowNet("l2_normalize");

@@ -69,100 +69,119 @@ TEST(Torch_Importer, simple_read)
ASSERT_FALSE(net.empty());
}
-static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "",
-                        bool check2ndBlob = false, bool isBinary = false)
-{
+class Test_Torch_layers : public DNNTestLayer
+{
+public:
+void runTorchNet(const String& prefix, String outLayerName = "",
+                 bool check2ndBlob = false, bool isBinary = false,
+                 double l1 = 0.0, double lInf = 0.0)
+{
String suffix = (isBinary) ? ".dat" : ".txt";
-Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
-ASSERT_FALSE(net.empty());
-net.setPreferableBackend(DNN_BACKEND_OPENCV);
-net.setPreferableTarget(targetId);
Mat inp, outRef;
ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) );
ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) );
+checkBackend(backend, target, &inp, &outRef);
+Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
+ASSERT_FALSE(net.empty());
+net.setPreferableBackend(backend);
+net.setPreferableTarget(target);
if (outLayerName.empty())
    outLayerName = net.getLayerNames().back();
net.setInput(inp);
std::vector<Mat> outBlobs;
net.forward(outBlobs, outLayerName);
-normAssert(outRef, outBlobs[0]);
+l1 = l1 ? l1 : default_l1;
+lInf = lInf ? lInf : default_lInf;
+normAssert(outRef, outBlobs[0], "", l1, lInf);
-if (check2ndBlob)
+if (check2ndBlob && backend != DNN_BACKEND_INFERENCE_ENGINE)
{
    Mat out2 = outBlobs[1];
    Mat ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary);
-   normAssert(out2, ref2);
+   normAssert(out2, ref2, "", l1, lInf);
}
}
+};
-typedef testing::TestWithParam<Target> Test_Torch_layers;
TEST_P(Test_Torch_layers, run_convolution)
{
-runTorchNet("net_conv", GetParam(), "", false, true);
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+runTorchNet("net_conv", "", false, true);
}
TEST_P(Test_Torch_layers, run_pool_max)
{
-runTorchNet("net_pool_max", GetParam(), "", true);
+if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_pool_max", "", true);
}
TEST_P(Test_Torch_layers, run_pool_ave)
{
-runTorchNet("net_pool_ave", GetParam());
+runTorchNet("net_pool_ave");
}
TEST_P(Test_Torch_layers, run_reshape)
{
-int targetId = GetParam();
-runTorchNet("net_reshape", targetId);
-runTorchNet("net_reshape_batch", targetId);
-runTorchNet("net_reshape_single_sample", targetId);
-runTorchNet("net_reshape_channels", targetId, "", false, true);
+runTorchNet("net_reshape");
+runTorchNet("net_reshape_batch");
+runTorchNet("net_reshape_channels", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_reshape_single_sample)
+{
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_reshape_single_sample", "", false, false,
+            (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.0052 : 0.0);
}
TEST_P(Test_Torch_layers, run_linear)
{
-runTorchNet("net_linear_2d", GetParam());
+if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_linear_2d");
}
TEST_P(Test_Torch_layers, run_concat)
{
-int targetId = GetParam();
-runTorchNet("net_concat", targetId, "l5_torchMerge");
-runTorchNet("net_depth_concat", targetId, "", false, true);
+runTorchNet("net_concat", "l5_torchMerge");
+runTorchNet("net_depth_concat", "", false, true, 0.0,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.021 : 0.0);
}
TEST_P(Test_Torch_layers, run_deconv)
{
-runTorchNet("net_deconv", GetParam());
+runTorchNet("net_deconv");
}
TEST_P(Test_Torch_layers, run_batch_norm)
{
-runTorchNet("net_batch_norm", GetParam(), "", false, true);
+runTorchNet("net_batch_norm", "", false, true);
}
TEST_P(Test_Torch_layers, net_prelu)
{
-runTorchNet("net_prelu", GetParam());
+runTorchNet("net_prelu");
}
TEST_P(Test_Torch_layers, net_cadd_table)
{
-runTorchNet("net_cadd_table", GetParam());
+runTorchNet("net_cadd_table");
}
TEST_P(Test_Torch_layers, net_softmax)
{
-int targetId = GetParam();
-runTorchNet("net_softmax", targetId);
-runTorchNet("net_softmax_spatial", targetId);
+runTorchNet("net_softmax");
+runTorchNet("net_softmax_spatial");
}
TEST_P(Test_Torch_layers, net_logsoftmax)
@@ -173,40 +192,55 @@ TEST_P(Test_Torch_layers, net_logsoftmax)
TEST_P(Test_Torch_layers, net_lp_pooling)
{
-int targetId = GetParam();
-runTorchNet("net_lp_pooling_square", targetId, "", false, true);
-runTorchNet("net_lp_pooling_power", targetId, "", false, true);
+runTorchNet("net_lp_pooling_square", "", false, true);
+runTorchNet("net_lp_pooling_power", "", false, true);
}
TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
{
-runTorchNet("net_conv_gemm_lrn", GetParam(), "", false, true);
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+    throw SkipTestException("");
+runTorchNet("net_conv_gemm_lrn", "", false, true,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.046 : 0.0,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.023 : 0.0);
}
TEST_P(Test_Torch_layers, net_inception_block)
{
-runTorchNet("net_inception_block", GetParam(), "", false, true);
+runTorchNet("net_inception_block", "", false, true);
}
TEST_P(Test_Torch_layers, net_normalize)
{
-runTorchNet("net_normalize", GetParam(), "", false, true);
+runTorchNet("net_normalize", "", false, true);
}
TEST_P(Test_Torch_layers, net_padding)
{
-int targetId = GetParam();
-runTorchNet("net_padding", targetId, "", false, true);
-runTorchNet("net_spatial_zero_padding", targetId, "", false, true);
-runTorchNet("net_spatial_reflection_padding", targetId, "", false, true);
+runTorchNet("net_padding", "", false, true);
+runTorchNet("net_spatial_zero_padding", "", false, true);
+runTorchNet("net_spatial_reflection_padding", "", false, true);
}
TEST_P(Test_Torch_layers, net_non_spatial)
{
-runTorchNet("net_non_spatial", GetParam(), "", false, true);
+if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
+    (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+runTorchNet("net_non_spatial", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_paralel)
+{
+if (backend != DNN_BACKEND_OPENCV || target != DNN_TARGET_CPU)
+    throw SkipTestException("");
+runTorchNet("net_parallel", "l5_torchMerge");
}
-INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, availableDnnTargets());
+TEST_P(Test_Torch_layers, net_residual)
+{
+runTorchNet("net_residual", "", false, true);
+}
typedef testing::TestWithParam<Target> Test_Torch_nets;
@@ -313,21 +347,6 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_nets, availableDnnTargets());
-// TODO: fix OpenCL and add to the rest of tests
-TEST(Torch_Importer, run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge");
-}
-TEST(Torch_Importer, DISABLED_run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_OPENCL, "l5_torchMerge");
-}
-TEST(Torch_Importer, net_residual)
-{
-    runTorchNet("net_residual", DNN_TARGET_CPU, "", false, true);
-}
// Test a custom layer
// https://github.com/torch/nn/blob/master/doc/convolution.md#nn.SpatialUpSamplingNearest
@@ -374,17 +393,29 @@ public:
}
}
virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE {}
private:
int scale;
};
-TEST(Torch_Importer, upsampling_nearest)
+TEST_P(Test_Torch_layers, upsampling_nearest)
{
// Test a custom layer.
CV_DNN_REGISTER_LAYER_CLASS(SpatialUpSamplingNearest, SpatialUpSamplingNearestLayer);
-runTorchNet("net_spatial_upsampling_nearest", DNN_TARGET_CPU, "", false, true);
+try
{
runTorchNet("net_spatial_upsampling_nearest", "", false, true);
}
catch (...)
{
LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
throw;
}
LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
// Test an implemented layer.
runTorchNet("net_spatial_upsampling_nearest", "", false, true);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, dnnBackendsAndTargets());
} }

@@ -307,8 +307,8 @@ icvLoadWindowPos( const char* name, CvRect& rect )
{
HKEY hkey;
char szKey[1024];
-strcpy( szKey, icvWindowPosRootKey );
-strcat( szKey, name );
+strcpy_s( szKey, 1024, icvWindowPosRootKey );
+strcat_s( szKey, 1024, name );
rect.x = rect.y = CW_USEDEFAULT;
rect.width = rect.height = 320;
@@ -368,8 +368,8 @@ icvSaveWindowPos( const char* name, CvRect rect )
HKEY hkey;
char szKey[1024];
char rootKey[1024];
-strcpy( szKey, icvWindowPosRootKey );
-strcat( szKey, name );
+strcpy_s( szKey, 1024, icvWindowPosRootKey );
+strcat_s( szKey, 1024, name );
if( RegOpenKeyEx( HKEY_CURRENT_USER,szKey,0,KEY_READ,&hkey) != ERROR_SUCCESS )
{
@@ -379,7 +379,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
char oldestKey[1024];
char currentKey[1024];
-strcpy( rootKey, icvWindowPosRootKey );
+strcpy_s( rootKey, 1024, icvWindowPosRootKey );
rootKey[strlen(rootKey)-1] = '\0';
if( RegCreateKeyEx(HKEY_CURRENT_USER, rootKey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_READ+KEY_WRITE, 0, &hroot, NULL) != ERROR_SUCCESS )
//RegOpenKeyEx( HKEY_CURRENT_USER,rootKey,0,KEY_READ,&hroot) != ERROR_SUCCESS )
@@ -398,7 +398,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
oldestTime.dwLowDateTime > accesstime.dwLowDateTime) )
{
oldestTime = accesstime;
-strcpy( oldestKey, currentKey );
+strcpy_s( oldestKey, 1024, currentKey );
}
}
@@ -1500,6 +1500,8 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
rgn = CreateRectRgn(0, 0, wrc.right, wrc.bottom);
rgn1 = CreateRectRgn(cr.left, cr.top, cr.right, cr.bottom);
rgn2 = CreateRectRgn(tr.left, tr.top, tr.right, tr.bottom);
CV_Assert(rgn != 0, rgn1 != 0, rgn2 != 0);
ret = CombineRgn(rgn, rgn, rgn1, RGN_DIFF);
ret = CombineRgn(rgn, rgn, rgn2, RGN_DIFF);

@@ -1771,7 +1771,7 @@ Corners in the image can be found as the local maxima of this response map.
size as src .
@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
@param ksize Aperture parameter for the Sobel operator.
-@param k Harris detector free parameter. See the formula below.
+@param k Harris detector free parameter. See the formula above.
@param borderType Pixel extrapolation method. See #BorderTypes.
*/
CV_EXPORTS_W void cornerHarris( InputArray src, OutputArray dst, int blockSize,

@@ -20,8 +20,12 @@ endforeach()
set(opencv_hdrs "")
set(opencv_userdef_hdrs "")
foreach(m ${OPENCV_PYTHON_MODULES})
-  ocv_list_filter(OPENCV_MODULE_${m}_HEADERS "${OPENCV_MODULE_${m}_LOCATION}/include" __hdrs)
-  list(APPEND opencv_hdrs ${__hdrs})
+  foreach (hdr ${OPENCV_MODULE_${m}_HEADERS})
+    ocv_is_subdir(is_sub "${OPENCV_MODULE_${m}_LOCATION}/include" "${hdr}")
if(is_sub)
list(APPEND opencv_hdrs "${hdr}")
endif()
endforeach()
file(GLOB userdef_hdrs ${OPENCV_MODULE_${m}_LOCATION}/misc/python/pyopencv*.hpp)
list(APPEND opencv_userdef_hdrs ${userdef_hdrs})
endforeach(m)

@@ -379,10 +379,9 @@ struct TSParams
class TS
{
-public:
-// constructor(s) and destructor
TS();
virtual ~TS();
+public:
enum
{
@@ -484,9 +483,6 @@ public:
SKIPPED=1
};
-// get file storage
-CvFileStorage* get_file_storage();
// get RNG to generate random input data for a test
RNG& get_rng() { return rng; }
@@ -629,9 +625,6 @@ struct DefaultRngAuto
void fillGradient(Mat& img, int delta = 5);
void smoothBorder(Mat& img, const Scalar& color, int delta = 3);
-void printVersionInfo(bool useStdOut = true);
// Utility functions
void addDataSearchPath(const std::string& path);
@@ -660,6 +653,13 @@ std::string findDataFile(const std::string& relative_path, bool required = true)
*/
std::string findDataDirectory(const std::string& relative_path, bool required = true);
// Test definitions
class SystemInfoCollector : public testing::EmptyTestEventListener
{
private:
virtual void OnTestProgramStart(const testing::UnitTest&);
};
#ifndef __CV_TEST_EXEC_ARGS
#if defined(_MSC_VER) && (_MSC_VER <= 1400)
@@ -671,15 +671,6 @@ std::string findDataDirectory(const std::string& relative_path, bool required =
#endif
#endif
-#ifdef HAVE_OPENCL
-namespace ocl {
-void dumpOpenCLDevice();
-}
-#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
-#else
-#define TEST_DUMP_OCL_INFO
-#endif
void parseCustomOptions(int argc, char **argv);
#define CV_TEST_INIT0_NOOP (void)0
@@ -696,8 +687,7 @@ int main(int argc, char **argv) \
ts->init(resourcesubdir); \
__CV_TEST_EXEC_ARGS(CV_TEST_INIT0_ ## INIT0) \
::testing::InitGoogleTest(&argc, argv); \
-cvtest::printVersionInfo(); \
-TEST_DUMP_OCL_INFO \
+::testing::UnitTest::GetInstance()->listeners().Append(new SystemInfoCollector); \
__CV_TEST_EXEC_ARGS(__VA_ARGS__) \
parseCustomOptions(argc, argv); \
} \

@@ -637,15 +637,6 @@ void PrintTo(const Size& sz, ::std::ostream* os);
#endif
#endif
-#ifdef HAVE_OPENCL
-namespace cvtest { namespace ocl {
-void dumpOpenCLDevice();
-}}
-#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
-#else
-#define TEST_DUMP_OCL_INFO
-#endif
#define CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, ...) \
CV_TRACE_FUNCTION(); \
@@ -654,11 +645,10 @@ void dumpOpenCLDevice();
::perf::TestBase::Init(std::vector<std::string>(impls, impls + sizeof impls / sizeof *impls), \
                       argc, argv); \
::testing::InitGoogleTest(&argc, argv); \
-cvtest::printVersionInfo(); \
+::testing::UnitTest::GetInstance()->listeners().Append(new cvtest::SystemInfoCollector); \
::testing::Test::RecordProperty("cv_module_name", #modulename); \
::perf::TestBase::RecordRunParameters(); \
__CV_TEST_EXEC_ARGS(__VA_ARGS__) \
-TEST_DUMP_OCL_INFO \
} \
return RUN_ALL_TESTS();

@@ -43,25 +43,6 @@
#include "opencv2/ts/ocl_test.hpp"
-#ifdef HAVE_OPENCL
-#define DUMP_CONFIG_PROPERTY(propertyName, propertyValue) \
-do { \
-    std::stringstream ssName, ssValue;\
-    ssName << propertyName;\
-    ssValue << (propertyValue); \
-    ::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
-} while (false)
-#define DUMP_MESSAGE_STDOUT(msg) \
-do { \
-    std::cout << msg << std::endl; \
-} while (false)
-#include <opencv2/core/opencl/opencl_info.hpp>
-#endif // HAVE_OPENCL
namespace cvtest {
namespace ocl {
@@ -69,13 +50,6 @@ using namespace cv;
int test_loop_times = 1; // TODO Read from command line / environment
-#ifdef HAVE_OPENCL
-void dumpOpenCLDevice()
-{
-    cv::dumpOpenCLInformation();
-}
-#endif // HAVE_OPENCL
Mat TestUtils::readImage(const String &fileName, int flags)
{
return cv::imread(cvtest::TS::ptr()->get_data_path() + fileName, flags);

@@ -74,7 +74,26 @@
# include <sys/stat.h>
#endif
#ifdef HAVE_OPENCL
#define DUMP_CONFIG_PROPERTY(propertyName, propertyValue) \
do { \
std::stringstream ssName, ssValue;\
ssName << propertyName;\
ssValue << (propertyValue); \
::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
} while (false)
#define DUMP_MESSAGE_STDOUT(msg) \
do { \
std::cout << msg << std::endl; \
} while (false)
#include "opencv2/core/opencl/opencl_info.hpp"
#endif // HAVE_OPENCL
#include "opencv2/core/utility.hpp"
#include "opencv_tests_config.hpp" #include "opencv_tests_config.hpp"
namespace opencv_test { namespace opencv_test {
@@ -230,7 +249,6 @@ bool BaseTest::can_do_fast_forward()
 void BaseTest::safe_run( int start_from )
 {
     CV_TRACE_FUNCTION();
-    read_params( ts->get_file_storage() );
     ts->update_context( 0, -1, true );
     ts->update_context( this, -1, true );
@@ -552,8 +570,6 @@ void TS::set_gtest_status()
 }
 
-CvFileStorage* TS::get_file_storage() { return 0; }
-
 void TS::update_context( BaseTest* test, int test_case_idx, bool update_ts_context )
 {
     if( current_test_info.test != test )
@@ -614,8 +630,11 @@ void TS::printf( int streams, const char* fmt, ... )
 }
 
-static TS ts;
-TS* TS::ptr() { return &ts; }
+TS* TS::ptr()
+{
+    static TS ts;
+    return &ts;
+}
 
 void fillGradient(Mat& img, int delta)
 {
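Editor's note: the TS::ptr() change above is the classic function-local-static ("Meyers") singleton: the instance is constructed on first call rather than at static-initialization time, which sidesteps initialization-order issues between translation units and is thread-safe since C++11. A standalone sketch of the pattern (names are illustrative):

    #include <iostream>

    class Registry
    {
    public:
        static Registry& instance()
        {
            static Registry r;  // constructed once, on first call
            return r;
        }
        void ping() { std::cout << "alive" << std::endl; }
    private:
        Registry() {}           // construction only via instance()
    };

    int main()
    {
        Registry::instance().ping();
        return 0;
    }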
@@ -866,6 +885,65 @@ std::string findDataDirectory(const std::string& relative_path, bool required)
     return findData(relative_path, required, true);
 }
 
+inline static std::string getSnippetFromConfig(const std::string & start, const std::string & end)
+{
+    const std::string buildInfo = cv::getBuildInformation();
+    size_t pos1 = buildInfo.find(start);
+    if (pos1 != std::string::npos)
+    {
+        pos1 += start.length();
+        pos1 = buildInfo.find_first_not_of(" \t\n\r", pos1);
+    }
+    size_t pos2 = buildInfo.find(end, pos1);
+    if (pos2 != std::string::npos)
+    {
+        pos2 = buildInfo.find_last_not_of(" \t\n\r", pos2);
+    }
+    if (pos1 != std::string::npos && pos2 != std::string::npos && pos1 < pos2)
+    {
+        return buildInfo.substr(pos1, pos2 - pos1 + 1);
+    }
+    return std::string();
+}
+
+inline static void recordPropertyVerbose(const std::string & property,
+                                         const std::string & msg,
+                                         const std::string & value,
+                                         const std::string & build_value = std::string())
+{
+    ::testing::Test::RecordProperty(property, value);
+    std::cout << msg << ": " << (value.empty() ? std::string("N/A") : value) << std::endl;
+    if (!build_value.empty())
+    {
+        ::testing::Test::RecordProperty(property + "_build", build_value);
+        if (build_value != value)
+            std::cout << "WARNING: build value differs from runtime: " << build_value << endl;
+    }
+}
+
+#ifdef _DEBUG
+#define CV_TEST_BUILD_CONFIG "Debug"
+#else
+#define CV_TEST_BUILD_CONFIG "Release"
+#endif
+
+void SystemInfoCollector::OnTestProgramStart(const testing::UnitTest&)
+{
+    std::cout << "CTEST_FULL_OUTPUT" << std::endl; // Tell CTest not to discard any output
+    recordPropertyVerbose("cv_version", "OpenCV version", cv::getVersionString(), CV_VERSION);
+    recordPropertyVerbose("cv_vcs_version", "OpenCV VCS version", getSnippetFromConfig("Version control:", "\n"));
+    recordPropertyVerbose("cv_build_type", "Build type", getSnippetFromConfig("Configuration:", "\n"), CV_TEST_BUILD_CONFIG);
+    recordPropertyVerbose("cv_compiler", "Compiler", getSnippetFromConfig("C++ Compiler:", "\n"));
+    recordPropertyVerbose("cv_parallel_framework", "Parallel framework", cv::currentParallelFramework());
+    recordPropertyVerbose("cv_cpu_features", "CPU features", cv::getCPUFeaturesLine());
+#ifdef HAVE_IPP
+    recordPropertyVerbose("cv_ipp_version", "Intel(R) IPP version", cv::ipp::useIPP() ? cv::ipp::getIppVersion() : "disabled");
+#endif
+#ifdef HAVE_OPENCL
+    cv::dumpOpenCLInformation();
+#endif
+}
+
 } //namespace cvtest
 /* End of file. */

@@ -2973,143 +2973,6 @@ MatComparator::operator()(const char* expr1, const char* expr2,
     << "- " << expr2 << ":\n" << MatPart(m2part, border > 0 ? &loc : 0) << ".\n";
 }
 
-void printVersionInfo(bool useStdOut)
-{
-    // Tell CTest not to discard any output
-    if(useStdOut) std::cout << "CTEST_FULL_OUTPUT" << std::endl;
-
-    ::testing::Test::RecordProperty("cv_version", CV_VERSION);
-    if(useStdOut) std::cout << "OpenCV version: " << CV_VERSION << std::endl;
-
-    std::string buildInfo( cv::getBuildInformation() );
-
-    size_t pos1 = buildInfo.find("Version control");
-    size_t pos2 = buildInfo.find('\n', pos1);
-    if(pos1 != std::string::npos && pos2 != std::string::npos)
-    {
-        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
-        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
-        ::testing::Test::RecordProperty("cv_vcs_version", ver);
-        if (useStdOut) std::cout << "OpenCV VCS version: " << ver << std::endl;
-    }
-
-    pos1 = buildInfo.find("inner version");
-    pos2 = buildInfo.find('\n', pos1);
-    if(pos1 != std::string::npos && pos2 != std::string::npos)
-    {
-        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
-        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
-        ::testing::Test::RecordProperty("cv_inner_vcs_version", ver);
-        if(useStdOut) std::cout << "Inner VCS version: " << ver << std::endl;
-    }
-
-    const char * build_type =
-#ifdef _DEBUG
-        "debug";
-#else
-        "release";
-#endif
-
-    ::testing::Test::RecordProperty("cv_build_type", build_type);
-    if (useStdOut) std::cout << "Build type: " << build_type << std::endl;
-
-    const char* parallel_framework = currentParallelFramework();
-
-    if (parallel_framework) {
-        ::testing::Test::RecordProperty("cv_parallel_framework", parallel_framework);
-        if (useStdOut) std::cout << "Parallel framework: " << parallel_framework << std::endl;
-    }
-
-    std::string cpu_features;
-
-#if CV_POPCNT
-    if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
-#endif
-#if CV_MMX
-    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
-#endif
-#if CV_SSE
-    if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
-#endif
-#if CV_SSE2
-    if (checkHardwareSupport(CV_CPU_SSE2)) cpu_features += " sse2";
-#endif
-#if CV_SSE3
-    if (checkHardwareSupport(CV_CPU_SSE3)) cpu_features += " sse3";
-#endif
-#if CV_SSSE3
-    if (checkHardwareSupport(CV_CPU_SSSE3)) cpu_features += " ssse3";
-#endif
-#if CV_SSE4_1
-    if (checkHardwareSupport(CV_CPU_SSE4_1)) cpu_features += " sse4.1";
-#endif
-#if CV_SSE4_2
-    if (checkHardwareSupport(CV_CPU_SSE4_2)) cpu_features += " sse4.2";
-#endif
-#if CV_AVX
-    if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
-#endif
-#if CV_AVX2
-    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
-#endif
-#if CV_FMA3
-    if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
-#endif
-#if CV_AVX_512F
-    if (checkHardwareSupport(CV_CPU_AVX_512F)) cpu_features += " avx-512f";
-#endif
-#if CV_AVX_512BW
-    if (checkHardwareSupport(CV_CPU_AVX_512BW)) cpu_features += " avx-512bw";
-#endif
-#if CV_AVX_512CD
-    if (checkHardwareSupport(CV_CPU_AVX_512CD)) cpu_features += " avx-512cd";
-#endif
-#if CV_AVX_512DQ
-    if (checkHardwareSupport(CV_CPU_AVX_512DQ)) cpu_features += " avx-512dq";
-#endif
-#if CV_AVX_512ER
-    if (checkHardwareSupport(CV_CPU_AVX_512ER)) cpu_features += " avx-512er";
-#endif
-#if CV_AVX_512IFMA512
-    if (checkHardwareSupport(CV_CPU_AVX_512IFMA512)) cpu_features += " avx-512ifma512";
-#endif
-#if CV_AVX_512PF
-    if (checkHardwareSupport(CV_CPU_AVX_512PF)) cpu_features += " avx-512pf";
-#endif
-#if CV_AVX_512VBMI
-    if (checkHardwareSupport(CV_CPU_AVX_512VBMI)) cpu_features += " avx-512vbmi";
-#endif
-#if CV_AVX_512VL
-    if (checkHardwareSupport(CV_CPU_AVX_512VL)) cpu_features += " avx-512vl";
-#endif
-#if CV_NEON
-    if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
-#endif
-#if CV_FP16
-    if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
-#endif
-#if CV_VSX
-    if (checkHardwareSupport(CV_CPU_VSX)) cpu_features += " VSX";
-#endif
-
-    cpu_features.erase(0, 1); // erase initial space
-
-    ::testing::Test::RecordProperty("cv_cpu_features", cpu_features);
-    if (useStdOut) std::cout << "CPU features: " << cpu_features << std::endl;
-
-#ifdef HAVE_IPP
-    const char * ipp_optimization = cv::ipp::useIPP()? "enabled" : "disabled";
-    ::testing::Test::RecordProperty("cv_ipp_optimization", ipp_optimization);
-    if (useStdOut) std::cout << "Intel(R) IPP optimization: " << ipp_optimization << std::endl;
-
-    cv::String ippVer = cv::ipp::getIppVersion();
-    ::testing::Test::RecordProperty("cv_ipp_version", ippVer);
-    if(useStdOut) std::cout << "Intel(R) IPP version: " << ippVer.c_str() << std::endl;
-#endif
-}
-
 void threshold( const Mat& _src, Mat& _dst,
                 double thresh, double maxval, int thresh_type )
 {

@@ -905,7 +905,7 @@ public:
     /** @brief Writes the next video frame
 
-    @param image The written frame
+    @param image The written frame. In general, color images are expected in BGR format.
 
     The function/method writes the specified image to video file. It must have the same size as has
     been specified when opening the video writer.
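Editor's note: a minimal usage sketch of the documented contract (fixed frame size, BGR input); the file name, codec, and fps below are illustrative:

    #include "opencv2/videoio.hpp"

    int main()
    {
        cv::Size frameSize(640, 480);
        // Size and FPS are fixed at open time; every written frame must match frameSize.
        cv::VideoWriter writer("out.avi", cv::VideoWriter::fourcc('M','J','P','G'), 30.0, frameSize);
        if (!writer.isOpened())
            return 1;
        cv::Mat frame(frameSize, CV_8UC3, cv::Scalar(255, 0, 0)); // solid blue, BGR order
        for (int i = 0; i < 30; i++)
            writer.write(frame); // or: writer << frame;
        return 0;                // destructor releases the file
    }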

@@ -811,6 +811,8 @@ void videoDevice::NukeDownstream(IBaseFilter *pBF){
     IEnumPins *pins = NULL;
     PIN_INFO pininfo;
     HRESULT hr = pBF->EnumPins(&pins);
+    if (hr != S_OK || !pins)
+        return;
     pins->Reset();
     while (hr == NOERROR)
     {
@@ -838,7 +840,7 @@ void videoDevice::NukeDownstream(IBaseFilter *pBF){
             pP->Release();
         }
     }
-    if (pins) pins->Release();
+    pins->Release();
 }
@@ -999,17 +1001,6 @@ videoDevice::~videoDevice(){
         (pGraph) = 0;
     }
 
-    //delete our pointers
-    delete pDestFilter;
-    delete pVideoInputFilter;
-    delete pGrabberF;
-    delete pGrabber;
-    delete pControl;
-    delete streamConf;
-    delete pMediaEvent;
-    delete pCaptureGraph;
-    delete pGraph;
-
     DebugPrintOut("SETUP: Device %i disconnected and freed\n\n",myID);
 }
@@ -1654,7 +1645,7 @@ bool videoInput::getVideoSettingFilter(int deviceID, long Property, long &min, l
     IAMVideoProcAmp *pAMVideoProcAmp = NULL;
     hr = VD->pVideoInputFilter->QueryInterface(IID_IAMVideoProcAmp, (void**)&pAMVideoProcAmp);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !pAMVideoProcAmp){
         DebugPrintOut("setVideoSetting - QueryInterface Error\n");
 #if 0
         if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
@@ -1676,7 +1667,7 @@ bool videoInput::getVideoSettingFilter(int deviceID, long Property, long &min, l
         hr = pAMVideoProcAmp->Get(Property, &currentValue, &flags);
     }
-    if(pAMVideoProcAmp)pAMVideoProcAmp->Release();
+    pAMVideoProcAmp->Release();
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
     if(VD->pVideoInputFilter)VD->pVideoInputFilter = NULL;
@@ -1881,7 +1872,7 @@ bool videoInput::getVideoSettingCamera(int deviceID, long Property, long &min, l
     IAMCameraControl *pIAMCameraControl = NULL;
     hr = VD->pVideoInputFilter->QueryInterface(IID_IAMCameraControl, (void**)&pIAMCameraControl);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !pIAMCameraControl){
         DebugPrintOut("setVideoSetting - QueryInterface Error\n");
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
@@ -1902,7 +1893,7 @@ bool videoInput::getVideoSettingCamera(int deviceID, long Property, long &min, l
         hr = pIAMCameraControl->Get(Property, &currentValue, &flags);
     }
-    if(pIAMCameraControl)pIAMCameraControl->Release();
+    pIAMCameraControl->Release();
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
     if(VD->pVideoInputFilter)VD->pVideoInputFilter = NULL;
@@ -2595,7 +2586,7 @@ int videoInput::start(int deviceID, videoDevice *VD){
     //we do this because webcams don't have a preview mode
     hr = VD->pCaptureGraph->FindInterface(&CAPTURE_MODE, &MEDIATYPE_Video, VD->pVideoInputFilter, IID_IAMStreamConfig, (void **)&VD->streamConf);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !VD->streamConf){
         DebugPrintOut("ERROR: Couldn't config the stream!\n");
         stopDevice(deviceID);
         return hr;
@@ -2737,14 +2728,8 @@ int videoInput::start(int deviceID, videoDevice *VD){
     //lets try freeing our stream conf here too
     //this will fail if the device is already running
-    if(VD->streamConf){
         VD->streamConf->Release();
         VD->streamConf = NULL;
-    }else{
-        DebugPrintOut("ERROR: connecting device - prehaps it is already being used?\n");
-        stopDevice(deviceID);
-        return S_FALSE;
-    }
 
     //NULL RENDERER//
@@ -3093,7 +3078,7 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
     IAMCrossbar *pXBar1 = NULL;
     HRESULT hr = pBuild->FindInterface(&LOOK_UPSTREAM_ONLY, NULL, pVidFilter,
                                        IID_IAMCrossbar, (void**)&pXBar1);
-    if (SUCCEEDED(hr))
+    if (SUCCEEDED(hr) && pXBar1)
     {
         bool foundDevice = false;
@@ -3163,10 +3148,6 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
         //we were getting a crash otherwise
         //if(Crossbar)Crossbar->Release();
        //if(Crossbar)Crossbar = NULL;
-
-        if(pXBar1)pXBar1->Release();
-        if(pXBar1)pXBar1 = NULL;
-
     }else{
         DebugPrintOut("SETUP: You are a webcam or snazzy firewire cam! No Crossbar needed\n");
         return hr;
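Editor's note: the recurring hardening pattern in these DirectShow hunks is to check both the HRESULT and the returned interface pointer before dereferencing, after which Release() can be called unconditionally. A condensed sketch of the pattern under those assumptions (the helper and property are illustrative, not from this patch):

    #include <dshow.h>   // DirectShow; link with strmiids.lib

    // Read a filter property defensively, mirroring the patched code.
    static bool getBrightness(IBaseFilter* filter, long& value)
    {
        IAMVideoProcAmp* amp = NULL;
        HRESULT hr = filter->QueryInterface(IID_IAMVideoProcAmp, (void**)&amp);
        if (FAILED(hr) || !amp)   // trust neither the HRESULT nor the out-pointer alone
            return false;
        long flags = 0;
        hr = amp->Get(VideoProcAmp_Brightness, &value, &flags);
        amp->Release();           // unconditional: the early return guarantees amp != NULL
        return SUCCEEDED(hr);
    }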

@@ -1224,7 +1224,11 @@ Ptr<IVideoCapture> cv::createGStreamerCapture(int index)
 class CvVideoWriter_GStreamer : public CvVideoWriter
 {
 public:
-    CvVideoWriter_GStreamer() { init(); }
+    CvVideoWriter_GStreamer()
+        : pipeline(0), source(0), encodebin(0), file(0), buffer(0), input_pix_fmt(0),
+          num_frames(0), framerate(0)
+    {
+    }
     virtual ~CvVideoWriter_GStreamer() CV_OVERRIDE { close(); }
 
     virtual bool open( const char* filename, int fourcc,
@@ -1232,7 +1236,6 @@ public:
     virtual void close();
     virtual bool writeFrame( const IplImage* image ) CV_OVERRIDE;
 protected:
-    void init();
     const char* filenameToMimetype(const char* filename);
     GstElement* pipeline;
     GstElement* source;
@@ -1245,22 +1248,6 @@ protected:
     double framerate;
 };
 
-/*!
- * \brief CvVideoWriter_GStreamer::init
- * initialise all variables
- */
-void CvVideoWriter_GStreamer::init()
-{
-    pipeline = NULL;
-    source = NULL;
-    encodebin = NULL;
-    file = NULL;
-    buffer = NULL;
-    num_frames = 0;
-    framerate = 0;
-}
-
 /*!
  * \brief CvVideoWriter_GStreamer::close
  * ends the pipeline by sending EOS and destroys the pipeline and all
@@ -1282,17 +1269,19 @@ void CvVideoWriter_GStreamer::close()
     //wait for EOS to trickle down the pipeline. This will let all elements finish properly
     GstBus* bus = gst_element_get_bus(pipeline);
     GstMessage *msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE, (GstMessageType)(GST_MESSAGE_ERROR | GST_MESSAGE_EOS));
-    if (GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR)
+    if (!msg || GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR)
     {
         CV_WARN("Error during VideoWriter finalization\n");
-        return;
-    }
-
-    if(msg != NULL)
-    {
-        gst_message_unref(msg);
-        g_object_unref(G_OBJECT(bus));
-    }
+        if(msg != NULL)
+        {
+            gst_message_unref(msg);
+            g_object_unref(G_OBJECT(bus));
+        }
+        return;
+    }
+
+    gst_message_unref(msg);
+    g_object_unref(G_OBJECT(bus));
 
     status = gst_element_set_state (pipeline, GST_STATE_NULL);
     if (status == GST_STATE_CHANGE_ASYNC)
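Editor's note: for readers unfamiliar with the EOS handshake the comment above describes, here is the general finalization pattern as a minimal standalone sketch (assuming an already-built GStreamer 1.x pipeline; error handling trimmed):

    // Finalize a pipeline: send EOS, wait for it to drain, then tear down.
    gst_element_send_event(pipeline, gst_event_new_eos());
    GstBus* bus = gst_element_get_bus(pipeline);
    GstMessage* msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE,
            (GstMessageType)(GST_MESSAGE_ERROR | GST_MESSAGE_EOS));
    if (msg)
        gst_message_unref(msg);       // EOS (or an error) reached the bus
    g_object_unref(G_OBJECT(bus));
    gst_element_set_state(pipeline, GST_STATE_NULL);  // safe to destroy now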

@@ -91,7 +91,7 @@ static bool pMFCreateDXGIDeviceManager_initialized = false;
 static FN_MFCreateDXGIDeviceManager pMFCreateDXGIDeviceManager = NULL;
 static void init_MFCreateDXGIDeviceManager()
 {
-    HMODULE h = LoadLibraryA("mfplat.dll");
+    HMODULE h = LoadLibraryExA("mfplat.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
     if (h)
     {
         pMFCreateDXGIDeviceManager = (FN_MFCreateDXGIDeviceManager)GetProcAddress(h, "MFCreateDXGIDeviceManager");
@@ -1720,7 +1720,7 @@ bool CvCapture_MSMF::setProperty( int property_id, double value )
             return setTime(duration * value, true);
         break;
     case CV_CAP_PROP_POS_FRAMES:
-        if (getFramerate(nativeFormat) != 0)
+        if (std::fabs(getFramerate(nativeFormat)) > 0)
             return setTime(value * 1e7 / getFramerate(nativeFormat), false);
         break;
     case CV_CAP_PROP_POS_MSEC:
@@ -1978,7 +1978,17 @@ private:
 CvVideoWriter_MSMF::CvVideoWriter_MSMF():
     MF(Media_Foundation::getInstance()),
-    initiated(false)
+    videoWidth(0),
+    videoHeight(0),
+    fps(0),
+    bitRate(0),
+    frameSize(0),
+    encodingFormat(),
+    inputFormat(),
+    streamIndex(0),
+    initiated(false),
+    rtStart(0),
+    rtDuration(0)
 {
 }

@@ -377,8 +377,8 @@ LRESULT PASCAL CvCaptureCAM_VFW::frameCallback( HWND hWnd, VIDEOHDR* hdr )
     if (!hWnd) return FALSE;
 
     capture = (CvCaptureCAM_VFW*)capGetUserData(hWnd);
+    if (!capture) return (LRESULT)FALSE;
     capture->hdr = hdr;
     return (LRESULT)TRUE;
 }

@@ -12,26 +12,18 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
 #include <iostream>
-#include <stdlib.h>
 
 using namespace std;
 using namespace cv;
 
-Mat img0, img1, res1, final;
+Mat src, img1, mask, final;
 
 Point point;
+vector<Point> pts;
 int drag = 0;
-int numpts = 100;
-Point* pts = new Point[100];
 int var = 0;
 int flag = 0;
-int flag1 = 0;
-
-int minx,miny,maxx,maxy,lenx,leny;
 
 void mouseHandler(int, int, int, int, void*);
@@ -40,16 +32,17 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_LBUTTONDOWN && !drag)
     {
-        if(flag1 == 0)
+        if (flag == 0)
         {
-            if(var==0)
-                img1 = img0.clone();
+            if (var == 0)
+                img1 = src.clone();
             point = Point(x, y);
-            circle(img1,point,2,Scalar(0, 0, 255),-1, 8, 0);
-            pts[var] = point;
+            circle(img1, point, 2, Scalar(0, 0, 255), -1, 8, 0);
+            pts.push_back(point);
             var++;
             drag = 1;
-            if(var>1)
+
+            if (var > 1)
                 line(img1,pts[var-2], point, Scalar(0, 0, 255), 2, 8, 0);
 
             imshow("Source", img1);
@@ -59,31 +52,18 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_LBUTTONUP && drag)
     {
         imshow("Source", img1);
         drag = 0;
     }
 
-    if (event == EVENT_RBUTTONDOWN)
-    {
-        flag1 = 1;
-        img1 = img0.clone();
-
-        for(int i = var; i < numpts ; i++)
-            pts[i] = point;
-
-        if(var!=0)
-        {
-            const Point* pts3[1] = {&pts[0]};
-            polylines( img1, pts3, &numpts,1, 1, Scalar(0,0,0), 2, 8, 0);
-        }
-
-        for(int i=0;i<var;i++)
-        {
-            minx = min(minx,pts[i].x);
-            maxx = max(maxx,pts[i].x);
-            miny = min(miny,pts[i].y);
-            maxy = max(maxy,pts[i].y);
-        }
-
-        lenx = maxx - minx;
-        leny = maxy - miny;
+    if (event == EVENT_RBUTTONDOWN)
+    {
+        flag = 1;
+        img1 = src.clone();
+
+        if (var != 0)
+        {
+            polylines( img1, pts, 1, Scalar(0,0,0), 2, 8, 0);
+        }
 
         imshow("Source", img1);
     }
@@ -91,71 +71,49 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_RBUTTONUP)
     {
         flag = var;
 
-        final = Mat::zeros(img0.size(),CV_8UC3);
-        res1 = Mat::zeros(img0.size(),CV_8UC1);
-        const Point* pts4[1] = {&pts[0]};
-
-        fillPoly(res1, pts4,&numpts, 1, Scalar(255, 255, 255), 8, 0);
-        bitwise_and(img0, img0, final,res1);
-        imshow("mask",res1);
-        imwrite("mask.png",res1);
+        final = Mat::zeros(src.size(), CV_8UC3);
+        mask = Mat::zeros(src.size(), CV_8UC1);
+        vector<vector<Point> > vpts;
+        vpts.push_back(pts);
+        fillPoly(mask, vpts, Scalar(255, 255, 255), 8, 0);
+        bitwise_and(src, src, final, mask);
+        imshow("Mask", mask);
+        imshow("Result", final);
 
         imshow("Source", img1);
     }
 
     if (event == EVENT_MBUTTONDOWN)
     {
-        for(int i = 0; i < numpts ; i++)
-        {
-            pts[i].x=0;
-            pts[i].y=0;
-        }
+        pts.clear();
         var = 0;
-        flag1 = 0;
-        minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
-        imshow("Source", img0);
         drag = 0;
+        flag = 0;
+        imshow("Source", src);
     }
 }
 
-static void help()
+int main(int argc, char **argv)
 {
-    cout << "\nThis program demonstrates using mouse events"
-        "\nCall:\n"
-        "./create_mask <image_name>\n"
-        "\n"
-        "\tleft mouse button - set a point to create mask shape"
-        "\n"
+    CommandLineParser parser(argc, argv, "{@input | ../data/lena.jpg | input image}");
+    parser.about("This program demonstrates using mouse events\n");
+    parser.printMessage();
+    cout << "\n\tleft mouse button - set a point to create mask shape\n"
         "\tright mouse button - create mask from points\n"
-        "\tmiddle mouse button - reset\n" << endl;
-}
-
-int main(int argc, char **argv)
-{
-    cv::CommandLineParser parser(argc, argv, "{@input | ../data/lena.jpg | input image}");
-    help();
-    string input_image = parser.get<string>("@input");
-    if (input_image.empty())
+        "\tmiddle mouse button - reset\n";
+
+    String input_image = parser.get<String>("@input");
+
+    src = imread(input_image);
+
+    if (src.empty())
     {
-        parser.printMessage();
-        parser.printErrors();
+        printf("Error opening image: %s\n", input_image.c_str());
         return 0;
     }
 
-    Mat src = imread(input_image);
-
-    minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
-
-    img0 = src;
-
-    res1 = Mat::zeros(img0.size(),CV_8UC1);
-    final = Mat::zeros(img0.size(),CV_8UC3);
-
-    //////////// source image ///////////////////
-    namedWindow("Source", 1);
+    namedWindow("Source", WINDOW_AUTOSIZE);
     setMouseCallback("Source", mouseHandler, NULL);
-    imshow("Source", img0);
+    imshow("Source", src);
     waitKey(0);
 
     return 0;
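Editor's note: the sample's switch from Point*/count arrays to vector<Point> relies on the InputArrayOfArrays overloads of fillPoly/polylines. A minimal sketch of that usage (the triangle and sizes are illustrative):

    #include "opencv2/core.hpp"
    #include "opencv2/imgproc.hpp"
    #include <vector>

    int main()
    {
        cv::Mat mask = cv::Mat::zeros(200, 200, CV_8UC1);
        std::vector<cv::Point> poly;                   // one polygon as a point list
        poly.push_back(cv::Point(20, 20));
        poly.push_back(cv::Point(180, 40));
        poly.push_back(cv::Point(100, 170));
        std::vector<std::vector<cv::Point> > polys(1, poly); // fillPoly takes a list of polygons
        cv::fillPoly(mask, polys, cv::Scalar(255));
        return cv::countNonZero(mask) > 0 ? 0 : 1;     // mask now contains the filled triangle
    }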

@@ -0,0 +1,149 @@
+/**
+ * @brief You will learn how to recover an out-of-focus image by Wiener filter
+ * @author Karpushin Vladislav, karpushin@ngs.ru, https://github.com/VladKarpushin
+ */
+
+#include <iostream>
+#include "opencv2/imgproc.hpp"
+#include "opencv2/imgcodecs.hpp"
+
+using namespace cv;
+using namespace std;
+
+void help();
+void calcPSF(Mat& outputImg, Size filterSize, int R);
+void fftshift(const Mat& inputImg, Mat& outputImg);
+void filter2DFreq(const Mat& inputImg, Mat& outputImg, const Mat& H);
+void calcWnrFilter(const Mat& input_h_PSF, Mat& output_G, double nsr);
+
+const String keys =
+"{help h usage ? |             | print this message   }"
+"{image          |original.JPG | input image name     }"
+"{R              |53           | radius               }"
+"{SNR            |5200         | signal to noise ratio}"
+;
+
+int main(int argc, char *argv[])
+{
+    help();
+    CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int R = parser.get<int>("R");
+    int snr = parser.get<int>("SNR");
+    string strInFileName = parser.get<String>("image");
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 0;
+    }
+
+    Mat imgIn;
+    imgIn = imread(strInFileName, IMREAD_GRAYSCALE);
+    if (imgIn.empty()) //check whether the image is loaded or not
+    {
+        cout << "ERROR : Image cannot be loaded..!!" << endl;
+        return -1;
+    }
+
+    Mat imgOut;
+
+//! [main]
+    // it needs to process even image only
+    Rect roi = Rect(0, 0, imgIn.cols & -2, imgIn.rows & -2);
+
+    //Hw calculation (start)
+    Mat Hw, h;
+    calcPSF(h, roi.size(), R);
+    calcWnrFilter(h, Hw, 1.0 / double(snr));
+    //Hw calculation (stop)
+
+    // filtering (start)
+    filter2DFreq(imgIn(roi), imgOut, Hw);
+    // filtering (stop)
+//! [main]
+
+    imgOut.convertTo(imgOut, CV_8U);
+    normalize(imgOut, imgOut, 0, 255, NORM_MINMAX);
+    imwrite("result.jpg", imgOut);
+    return 0;
+}
+
+void help()
+{
+    cout << "2018-07-12" << endl;
+    cout << "DeBlur_v8" << endl;
+    cout << "You will learn how to recover an out-of-focus image by Wiener filter" << endl;
+}
+
+//! [calcPSF]
+void calcPSF(Mat& outputImg, Size filterSize, int R)
+{
+    Mat h(filterSize, CV_32F, Scalar(0));
+    Point point(filterSize.width / 2, filterSize.height / 2);
+    circle(h, point, R, 255, -1, 8);
+    Scalar summa = sum(h);
+    outputImg = h / summa[0];
+}
+//! [calcPSF]
+
+//! [fftshift]
+void fftshift(const Mat& inputImg, Mat& outputImg)
+{
+    outputImg = inputImg.clone();
+    int cx = outputImg.cols / 2;
+    int cy = outputImg.rows / 2;
+    Mat q0(outputImg, Rect(0, 0, cx, cy));
+    Mat q1(outputImg, Rect(cx, 0, cx, cy));
+    Mat q2(outputImg, Rect(0, cy, cx, cy));
+    Mat q3(outputImg, Rect(cx, cy, cx, cy));
+    Mat tmp;
+    q0.copyTo(tmp);
+    q3.copyTo(q0);
+    tmp.copyTo(q3);
+    q1.copyTo(tmp);
+    q2.copyTo(q1);
+    tmp.copyTo(q2);
+}
+//! [fftshift]
+
+//! [filter2DFreq]
+void filter2DFreq(const Mat& inputImg, Mat& outputImg, const Mat& H)
+{
+    Mat planes[2] = { Mat_<float>(inputImg.clone()), Mat::zeros(inputImg.size(), CV_32F) };
+    Mat complexI;
+    merge(planes, 2, complexI);
+    dft(complexI, complexI, DFT_SCALE);
+
+    Mat planesH[2] = { Mat_<float>(H.clone()), Mat::zeros(H.size(), CV_32F) };
+    Mat complexH;
+    merge(planesH, 2, complexH);
+    Mat complexIH;
+    mulSpectrums(complexI, complexH, complexIH, 0);
+
+    idft(complexIH, complexIH);
+    split(complexIH, planes);
+    outputImg = planes[0];
+}
+//! [filter2DFreq]
+
+//! [calcWnrFilter]
+void calcWnrFilter(const Mat& input_h_PSF, Mat& output_G, double nsr)
+{
+    Mat h_PSF_shifted;
+    fftshift(input_h_PSF, h_PSF_shifted);
+    Mat planes[2] = { Mat_<float>(h_PSF_shifted.clone()), Mat::zeros(h_PSF_shifted.size(), CV_32F) };
+    Mat complexI;
+    merge(planes, 2, complexI);
+    dft(complexI, complexI);
+    split(complexI, planes);
+    Mat denom;
+    pow(abs(planes[0]), 2, denom);
+    denom += nsr;
+    divide(planes[0], denom, output_G);
+}
+//! [calcWnrFilter]
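Editor's note: in formulas, the restoration this new sample implements is standard Wiener deconvolution for a circular defocus PSF. Because the disk PSF is real and centro-symmetric, its spectrum H is real, so the code's H/(|H|^2 + nsr) matches the textbook conjugate form. Using the tutorial's names (nsr = 1/SNR):

    % Defocus PSF (calcPSF): a disk of radius R, normalized to unit sum
    h(x,y) = \begin{cases} 1/S, & (x-x_0)^2 + (y-y_0)^2 \le R^2 \\ 0, & \text{otherwise} \end{cases}

    % Wiener restoration filter (calcWnrFilter)
    H_w(u,v) = \frac{H^*(u,v)}{|H(u,v)|^2 + \frac{1}{\mathrm{SNR}}}

    % Restored spectrum (filter2DFreq): elementwise product, then inverse DFT
    \hat{F}(u,v) = H_w(u,v)\, G(u,v)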

@@ -190,7 +190,7 @@ while cv.waitKey(1) < 0:
     net.setInput(blob)
     if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
         frame = cv.resize(frame, (inpWidth, inpHeight))
-        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info')
+        net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')
     outs = net.forward(getOutputsNames(net))
 
     postprocess(frame, outs)
