Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/12118/head
Alexander Alekhin
commit 82c477c9f7
Files changed (71), with the number of changed lines per file:

  1. CMakeLists.txt (20)
  2. cmake/OpenCVCompilerOptimizations.cmake (11)
  3. cmake/OpenCVDetectInferenceEngine.cmake (124)
  4. cmake/OpenCVModule.cmake (6)
  5. doc/opencv.bib (14)
  6. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg (BIN)
  7. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png (BIN)
  8. doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg (BIN)
  9. doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown (112)
  10. doc/tutorials/imgproc/table_of_content_imgproc.markdown (10)
  11. modules/core/include/opencv2/core/hal/intrin.hpp (21)
  12. modules/core/include/opencv2/core/hal/intrin_avx.hpp (391)
  13. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (23)
  14. modules/core/include/opencv2/core/hal/intrin_neon.hpp (30)
  15. modules/core/include/opencv2/core/hal/intrin_sse.hpp (486)
  16. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (13)
  17. modules/core/include/opencv2/core/utility.hpp (12)
  18. modules/core/src/arithm.cpp (3)
  19. modules/core/src/copy.cpp (3)
  20. modules/core/src/mathfuncs_core.simd.hpp (58)
  21. modules/core/src/matrix.cpp (13)
  22. modules/core/src/mean.cpp (4)
  23. modules/core/src/merge.cpp (71)
  24. modules/core/src/ocl.cpp (15)
  25. modules/core/src/rand.cpp (4)
  26. modules/core/src/split.cpp (69)
  27. modules/core/src/system.cpp (21)
  28. modules/core/src/umatrix.cpp (14)
  29. modules/core/test/test_arithm.cpp (10)
  30. modules/core/test/test_concatenation.cpp (119)
  31. modules/core/test/test_intrin.avx2.cpp (5)
  32. modules/core/test/test_intrin.cpp (298)
  33. modules/core/test/test_intrin.fp16.cpp (2)
  34. modules/core/test/test_intrin.simd.hpp (296)
  35. modules/core/test/test_intrin_utils.hpp (169)
  36. modules/core/test/test_rand.cpp (1)
  37. modules/dnn/CMakeLists.txt (16)
  38. modules/dnn/include/opencv2/dnn/dnn.hpp (2)
  39. modules/dnn/include/opencv2/dnn/shape_utils.hpp (26)
  40. modules/dnn/src/dnn.cpp (131)
  41. modules/dnn/src/layers/detection_output_layer.cpp (6)
  42. modules/dnn/src/layers/pooling_layer.cpp (63)
  43. modules/dnn/src/layers/proposal_layer.cpp (55)
  44. modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp (3)
  45. modules/dnn/src/op_inf_engine.cpp (20)
  46. modules/dnn/src/op_inf_engine.hpp (20)
  47. modules/dnn/src/opencl/ocl4dnn_pooling.cl (4)
  48. modules/dnn/src/tensorflow/tf_importer.cpp (8)
  49. modules/dnn/src/torch/torch_importer.cpp (10)
  50. modules/dnn/test/test_backends.cpp (9)
  51. modules/dnn/test/test_caffe_importer.cpp (99)
  52. modules/dnn/test/test_halide_layers.cpp (74)
  53. modules/dnn/test/test_layers.cpp (10)
  54. modules/dnn/test/test_tf_importer.cpp (7)
  55. modules/dnn/test/test_torch_importer.cpp (159)
  56. modules/highgui/src/window_w32.cpp (14)
  57. modules/imgproc/include/opencv2/imgproc.hpp (2)
  58. modules/python/bindings/CMakeLists.txt (8)
  59. modules/ts/include/opencv2/ts.hpp (28)
  60. modules/ts/include/opencv2/ts/ts_perf.hpp (12)
  61. modules/ts/src/ocl_test.cpp (26)
  62. modules/ts/src/ts.cpp (88)
  63. modules/ts/src/ts_func.cpp (137)
  64. modules/videoio/include/opencv2/videoio.hpp (2)
  65. modules/videoio/src/cap_dshow.cpp (37)
  66. modules/videoio/src/cap_gstreamer.cpp (33)
  67. modules/videoio/src/cap_msmf.cpp (16)
  68. modules/videoio/src/cap_vfw.cpp (2)
  69. samples/cpp/create_mask.cpp (122)
  70. samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp (149)
  71. samples/dnn/object_detection.py (2)

@@ -1402,15 +1402,19 @@ if(WITH_HALIDE OR HAVE_HALIDE)
  status("    Halide:" HAVE_HALIDE THEN "YES (${HALIDE_LIBRARIES} ${HALIDE_INCLUDE_DIRS})" ELSE NO)
endif()

-if(WITH_INF_ENGINE OR HAVE_INF_ENGINE)
-  if(HAVE_INF_ENGINE)
-    set(__msg "YES")
-    if(DEFINED INF_ENGINE_VERSION)
-      set(__msg "YES (ver ${INF_ENGINE_VERSION})")
-    endif()
+if(WITH_INF_ENGINE OR INF_ENGINE_TARGET)
+  if(INF_ENGINE_TARGET)
+    set(__msg "YES (${INF_ENGINE_RELEASE} / ${INF_ENGINE_VERSION})")
+    get_target_property(_lib ${INF_ENGINE_TARGET} IMPORTED_LOCATION)
+    if(NOT _lib)
+      get_target_property(_lib_rel ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_RELEASE)
+      get_target_property(_lib_dbg ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_DEBUG)
+      set(_lib "${_lib_rel} / ${_lib_dbg}")
+    endif()
+    get_target_property(_inc ${INF_ENGINE_TARGET} INTERFACE_INCLUDE_DIRECTORIES)
    status("    Inference Engine:" "${__msg}")
-    status("        libs:" "${INF_ENGINE_LIBRARIES}")
-    status("        includes:" "${INF_ENGINE_INCLUDE_DIRS}")
+    status("        libs:" "${_lib}")
+    status("        includes:" "${_inc}")
  else()
    status("    Inference Engine:" "NO")
  endif()

@@ -700,12 +700,21 @@ macro(ocv_compiler_optimization_fill_cpu_config)
    list(APPEND __dispatch_modes ${CPU_DISPATCH_${OPT}_FORCE} ${OPT})
  endforeach()
  list(REMOVE_DUPLICATES __dispatch_modes)
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "")
  foreach(OPT ${__dispatch_modes})
    set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
#define CV_CPU_DISPATCH_COMPILE_${OPT} 1")
  endforeach()
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+\n\n#define CV_CPU_DISPATCH_FEATURES 0 \\")
+ foreach(OPT ${__dispatch_modes})
+   if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+     set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+    , CV_CPU_${OPT} \\")
+   endif()
+ endforeach()
+ set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}\n")

  set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "// AUTOGENERATED, DO NOT EDIT\n")
  foreach(OPT ${CPU_ALL_OPTIMIZATIONS})
    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
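For illustration, with a hypothetical dispatch list of SSE4_1 and AVX2 the snippet above would emit roughly the following fragment into the generated CPU configuration header (illustrative output only, not part of the commit):

// Hypothetical fragment generated by the CMake code above for
// CPU_DISPATCH = SSE4_1;AVX2 (illustrative only).
#define CV_CPU_DISPATCH_COMPILE_SSE4_1 1
#define CV_CPU_DISPATCH_COMPILE_AVX2 1

#define CV_CPU_DISPATCH_FEATURES 0 \
, CV_CPU_SSE4_1 \
, CV_CPU_AVX2 \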

@@ -1,79 +1,87 @@
-# The script detects Intel(R) Inference Engine installation
-#
-# Parameters:
-# INTEL_CVSDK_DIR - Path to Inference Engine root folder
-# IE_PLUGINS_PATH - Path to folder with Inference Engine plugins
-#
-# On return this will define:
-#
-# HAVE_INF_ENGINE - True if Intel Inference Engine was found
-# INF_ENGINE_INCLUDE_DIRS - Inference Engine include folder
-# INF_ENGINE_LIBRARIES - Inference Engine libraries and it's dependencies
-#
-macro(ie_fail)
-    set(HAVE_INF_ENGINE FALSE)
-    return()
-endmacro()
-
-find_package(InferenceEngine QUIET)
-if(InferenceEngine_FOUND)
-    set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}")
-    set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}")
-    set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
-    set(HAVE_INF_ENGINE TRUE)
-    return()
-endif()
-
-ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH)
-
-if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
-    set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
-    if(DEFINED INTEL_CVSDK_DIR)
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/")
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine")
-    endif()
-
-    if(NOT ie_root_paths)
-        list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/")
-    endif()
-
-    find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths})
-    if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$")
-        unset(INF_ENGINE_ROOT_DIR CACHE)
-    endif()
-endif()
-
-set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory")
-
-if(NOT INF_ENGINE_ROOT_DIR
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}"
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp"
-)
-    message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.")
-    ie_fail()
-endif()
-
-set(INF_ENGINE_LIBRARIES "")
-set(ie_lib_list inference_engine)
-
-if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}")
-    set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}")
-endif()
-
-link_directories(
-    ${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib
-    ${INF_ENGINE_ROOT_DIR}/external/cldnn/lib
-)
-
-foreach(lib ${ie_lib_list})
-    find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH})
-    if(NOT ${lib})
-        message(WARNING "DL IE: Can't find library: '${lib}'")
-        ie_fail()
-    endif()
-    list(APPEND INF_ENGINE_LIBRARIES ${${lib}})
-endforeach()
-
-set(HAVE_INF_ENGINE TRUE)
+# The script detects Intel(R) Inference Engine installation
+#
+# Cache variables:
+# INF_ENGINE_OMP_DIR - directory with OpenMP library to link with (needed by some versions of IE)
+# INF_ENGINE_RELEASE - a number reflecting IE source interface (linked with OpenVINO release)
+#
+# Detect parameters:
+# 1. Native cmake IE package:
+#    - environment variable InferenceEngine_DIR is set to location of cmake module
+# 2. Custom location:
+#    - INF_ENGINE_INCLUDE_DIRS - headers search location
+#    - INF_ENGINE_LIB_DIRS - library search location
+# 3. OpenVINO location:
+#    - environment variable INTEL_CVSDK_DIR is set to location of OpenVINO installation dir
+#    - INF_ENGINE_PLATFORM - part of name of library directory representing its platform (default ubuntu_16.04)
+#
+# Result:
+# INF_ENGINE_TARGET - set to name of imported library target representing InferenceEngine
+#
+
+if(NOT HAVE_CXX11)
+    message(WARNING "DL Inference engine requires C++11. You can turn it on via ENABLE_CXX11=ON CMake flag.")
+    return()
+endif()
+
+# =======================
+
+function(add_custom_ie_build _inc _lib _lib_rel _lib_dbg _msg)
+  if(NOT _inc OR NOT (_lib OR _lib_rel OR _lib_dbg))
+    return()
+  endif()
+  add_library(inference_engine UNKNOWN IMPORTED)
+  set_target_properties(inference_engine PROPERTIES
+    IMPORTED_LOCATION "${_lib}"
+    IMPORTED_IMPLIB_RELEASE "${_lib_rel}"
+    IMPORTED_IMPLIB_DEBUG "${_lib_dbg}"
+    INTERFACE_INCLUDE_DIRECTORIES "${_inc}"
+  )
+  find_library(omp_lib iomp5 PATHS "${INF_ENGINE_OMP_DIR}" NO_DEFAULT_PATH)
+  if(NOT omp_lib)
+    message(WARNING "OpenMP for IE have not been found. Set INF_ENGINE_OMP_DIR variable if you experience build errors.")
+  else()
+    set_target_properties(inference_engine PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${omp_lib}")
+  endif()
+  set(INF_ENGINE_VERSION "Unknown" CACHE STRING "")
+  set(INF_ENGINE_TARGET inference_engine PARENT_SCOPE)
+  message(STATUS "Detected InferenceEngine: ${_msg}")
+endfunction()
+
+# ======================
+
+find_package(InferenceEngine QUIET)
+if(InferenceEngine_FOUND)
+  set(INF_ENGINE_TARGET IE::inference_engine)
+  set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}" CACHE STRING "")
+  message(STATUS "Detected InferenceEngine: cmake package")
+endif()
+
+if(NOT INF_ENGINE_TARGET AND INF_ENGINE_LIB_DIRS AND INF_ENGINE_INCLUDE_DIRS)
+  find_path(ie_custom_inc "inference_engine.hpp" PATHS "${INF_ENGINE_INCLUDE_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_rel "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_dbg "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_inc}" "${ie_custom_lib}" "${ie_custom_lib_rel}" "${ie_custom_lib_dbg}" "INF_ENGINE_{INCLUDE,LIB}_DIRS")
+endif()
+
+set(_loc "$ENV{INTEL_CVSDK_DIR}")
+if(NOT INF_ENGINE_TARGET AND _loc)
+  set(INF_ENGINE_PLATFORM "ubuntu_16.04" CACHE STRING "InferenceEngine platform (library dir)")
+  find_path(ie_custom_env_inc "inference_engine.hpp" PATHS "${_loc}/deployment_tools/inference_engine/include" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/${INF_ENGINE_PLATFORM}/intel64" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_rel "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_dbg "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_env_inc}" "${ie_custom_env_lib}" "${ie_custom_env_lib_rel}" "${ie_custom_env_lib_dbg}" "OpenVINO (${_loc})")
+endif()
+
+# Add more features to the target
+
+if(INF_ENGINE_TARGET)
+  if(NOT INF_ENGINE_RELEASE)
+    message(WARNING "InferenceEngine version have not been set, 2018R2 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+  endif()
+  set(INF_ENGINE_RELEASE "2018020000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2018R2.0.2 -> 2018020002)")
+  set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
+    INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
+  )
+endif()
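Downstream C++ code linked against INF_ENGINE_TARGET then sees the compile definitions exported above; a minimal sketch of how such code can key off them (illustrative, not code from this commit):

// Sketch: consuming the definitions set via INTERFACE_COMPILE_DEFINITIONS
// above (HAVE_INF_ENGINE, INF_ENGINE_RELEASE); illustrative only.
#ifdef HAVE_INF_ENGINE
#  include <inference_engine.hpp>
#  if INF_ENGINE_RELEASE < 2018020000  // YYYYAABBCC, e.g. 2018R2.0.2 -> 2018020002
#    error "This sketch assumes the 2018R2 InferenceEngine interface or newer"
#  endif
#endif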

@@ -1132,7 +1132,7 @@ function(ocv_add_perf_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_PERF_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${perf_deps} "${perf_path}")
-     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_PERF_${the_module}_DEPS})
      add_dependencies(opencv_perf_tests ${the_target})

      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};PerfTest")

@@ -1175,7 +1175,7 @@ function(ocv_add_perf_tests)
endfunction()

# this is a command for adding OpenCV accuracy/regression tests to the module
-# ocv_add_accuracy_tests([FILES <source group name> <list of sources>] [DEPENDS_ON] <list of extra dependencies>)
+# ocv_add_accuracy_tests(<list of extra dependencies>)
function(ocv_add_accuracy_tests)
  ocv_debug_message("ocv_add_accuracy_tests(" ${ARGN} ")")

@@ -1211,7 +1211,7 @@ function(ocv_add_accuracy_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
-     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+     ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_TEST_${the_module}_DEPS})
      add_dependencies(opencv_tests ${the_target})
      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};AccuracyTest")

@@ -1016,3 +1016,17 @@
  year = {2017},
  organization = {IEEE}
}

@ARTICLE{gonzalez,
  title={Digital Image Fundamentals, Digital Image Processing},
  author={Gonzalez, Rafael C and others},
  year={1987},
  publisher={Addison Wesley Publishing Company}
}

@ARTICLE{gruzman,
  title={Digital Image Processing in Information Systems (in Russian)},
  author={Gruzman, I.S. and Kirichuk, V.S. and Kosykh, V.P. and Peretyagin, G.I. and Spektor, A.A.},
  year={2000},
  publisher={NSTU Publishing House, Novosibirsk}
}

Binary files added (content not shown):

  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg (14 KiB)
  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png (630 B)
  doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg (42 KiB)

@ -0,0 +1,112 @@
Out-of-focus Deblur Filter {#tutorial_out_of_focus_deblur_filter}
==========================
Goal
----
In this tutorial you will learn:
- what a degradation image model is
- what the PSF of an out-of-focus image is
- how to restore a blurred image
- what the Wiener filter is
Theory
------
@note The explanation is based on the books @cite gonzalez and @cite gruzman. You can also refer to Matlab's tutorial [Image Deblurring in Matlab] and the article [SmartDeblur].
@note The out-of-focus image on this page is a real-world image; the defocus was produced manually with the camera optics.
### What is a degradation image model?
A mathematical model of image degradation, in the frequency domain, is:

\f[S = H\cdot U + N\f]

where
\f$S\f$ is the spectrum of the blurred (degraded) image,
\f$U\f$ is the spectrum of the original (undegraded) image,
\f$H\f$ is the frequency response of the point spread function (PSF),
\f$N\f$ is the spectrum of the additive noise.

A circular PSF is a good approximation of out-of-focus distortion. Such a PSF is specified by a single parameter, the radius \f$R\f$, and a circular PSF is what this tutorial uses.
![Circular point spread function](psf.png)
### How to restore a blurred image?

The objective of restoration (deblurring) is to obtain an estimate of the original image. The restoration formula in the frequency domain is:

\f[U' = H_w\cdot S\f]

where
\f$U'\f$ is the spectrum of the estimate of the original image \f$U\f$, and
\f$H_w\f$ is the restoration filter, for example, the Wiener filter.

### What is the Wiener filter?

The Wiener filter is one way to restore a blurred image. Suppose the PSF is a real and symmetric signal and the power spectra of the original image and of the noise are unknown; the simplified Wiener formula is then:

\f[H_w = \frac{H}{|H|^2+\frac{1}{SNR}} \f]

where
\f$SNR\f$ is the signal-to-noise ratio.

So, to recover an out-of-focus image with the Wiener filter, we need to know the \f$SNR\f$ and the radius \f$R\f$ of the circular PSF.
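As a quick illustration of this formula, here is a minimal sketch of synthesizing \f$H_w\f$ (assuming a single-channel float PSF `h` already padded to the image size; it omits the fftshift recentering done in the full sample, whose calcWnrFilter() is included below):

```cpp
// Minimal sketch of H_w = H / (|H|^2 + 1/SNR); illustrative only.
#include <opencv2/imgproc.hpp>
using namespace cv;

Mat synthesizeWiener(const Mat& h, double snr)
{
    Mat planes[2] = { h.clone(), Mat::zeros(h.size(), CV_32F) };
    Mat complexH;
    merge(planes, 2, complexH);
    dft(complexH, complexH);                 // H
    split(complexH, planes);
    Mat denom;
    magnitude(planes[0], planes[1], denom);  // |H|
    multiply(denom, denom, denom);           // |H|^2
    denom += 1.0 / snr;                      // |H|^2 + 1/SNR
    divide(planes[0], denom, planes[0]);     // assumes H is real and symmetric
    return planes[0];                        // H_w
}
```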
Source code
-----------
You can find the source code at `samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp` in the OpenCV source tree.
@include cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp
Explanation
-----------
An out-of-focus image recovery algorithm consists of PSF generation, Wiener filter generation, and filtering of the blurred image in the frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp main

The function calcPSF() forms a circular PSF according to the input parameter, the radius \f$R\f$:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcPSF

The function calcWnrFilter() synthesizes the simplified Wiener filter \f$H_w\f$ according to the formula described above:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcWnrFilter

The function fftshift() rearranges the PSF; this code was copied from the tutorial @ref tutorial_discrete_fourier_transform "Discrete Fourier Transform":
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp fftshift

The function filter2DFreq() filters the blurred image in the frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp filter2DFreq
Result
------
Below you can see the real out-of-focus image:
![Out-of-focus image](images/original.jpg)

The result below was obtained with \f$R = 53\f$ and \f$SNR = 5200\f$:
![The restored (deblurred) image](images/recovered.jpg)

The Wiener filter was used, and the values of \f$R\f$ and \f$SNR\f$ were selected manually to give the best possible visual result. We can see that the result is not perfect, but it gives us a hint of the image content; with some effort, the text is readable.

@note The parameter \f$R\f$ is the most important, so you should adjust \f$R\f$ first, then \f$SNR\f$.

@note Sometimes you can observe a ringing effect in the restored image. This effect can be reduced by several methods; for example, you can taper the input image edges.
You can also find a quick video demonstration of this on
[YouTube](https://youtu.be/0bEcE4B0XP4).
@youtube{0bEcE4B0XP4}
References
----------
- [Image Deblurring in Matlab] - Image Deblurring in Matlab
- [SmartDeblur] - SmartDeblur site
<!-- invisible references list -->
[Digital Image Processing]: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/Digital_Image_Processing_2ndEd.pdf
[Image Deblurring in Matlab]: https://www.mathworks.com/help/images/image-deblurring.html
[SmartDeblur]: http://yuzhikov.com/articles/BlurredImagesRestoration1.htm

@@ -292,3 +292,13 @@ In this section you will learn about the image processing (manipulation) functions inside OpenCV.

    *Author:* Theodore Tsesmelis

    Where we learn to segment objects using Laplacian filtering, the Distance Transformation and the Watershed algorithm.

- @subpage tutorial_out_of_focus_deblur_filter

    *Languages:* C++

    *Compatibility:* \> OpenCV 2.0

    *Author:* Karpushin Vladislav

    You will learn how to recover an out-of-focus image with the Wiener filter.

@@ -60,6 +60,17 @@
// access from within opencv code more accessible
namespace cv {

+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}

template<typename _Tp> struct V_TypeTraits
{
};
@@ -154,7 +165,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
-// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load())
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())

#if CV_AVX2
#include "opencv2/core/hal/intrin_avx.hpp"
@@ -214,14 +225,16 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
+   inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
+   inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }

#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }

#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -316,7 +329,7 @@ template<typename _Tp> struct V_RegTraits
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
inline void vx_cleanup() { v256_cleanup(); }

-#elif CV_SIMD128
+#elif CV_SIMD128 || CV_SIMD128_CPP

typedef v_uint8x16  v_uint8;
typedef v_int8x16   v_int8;
typedef v_uint16x8  v_uint16;
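A brief usage sketch of the width-agnostic vx_ forms defined by these macros (illustrative only; assumes OpenCV built with CV_SIMD enabled):

// Sketch: vx_load/v_store map to the widest available ISA (v_ or v256_).
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

void scale_add(const float* a, const float* b, float* dst, int n, float k)
{
    int i = 0;
#if CV_SIMD
    const v_float32 vk = vx_setall_f32(k);
    for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
    {
        v_float32 va = vx_load(a + i);
        v_float32 vb = vx_load(b + i);
        v_store(dst + i, v_muladd(va, vk, vb));
    }
    vx_cleanup();
#endif
    for (; i < n; i++)            // scalar tail
        dst[i] = a[i] * k + b[i];
}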

@@ -304,6 +304,17 @@ inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1
    { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm256_store_si256((__m256i*)ptr, a.val); } \
+   inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+   { _mm256_stream_si256((__m256i*)ptr, a.val); } \
+   inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+   { \
+       if( mode == hal::STORE_UNALIGNED ) \
+           _mm256_storeu_si256((__m256i*)ptr, a.val); \
+       else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+           _mm256_stream_si256((__m256i*)ptr, a.val); \
+       else \
+           _mm256_store_si256((__m256i*)ptr, a.val); \
+   } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
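A hedged example of when the new nocache path matters: writing a large buffer once, with no near-term reads, is exactly what streaming stores target (illustrative sketch; assumes an AVX2 build and a 32-byte aligned destination):

// Sketch: the v_store overload with hal::StoreMode added above.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void fill_large(const v_uint8x32& v, uchar* dst /* 32-byte aligned */, size_t n)
{
    for (size_t i = 0; i + 32 <= n; i += 32)
        v_store(dst + i, v, hal::STORE_ALIGNED_NOCACHE); // _mm256_stream_si256
}
#endif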
@@ -338,6 +349,17 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
    { _mm256_storeu_##suffix(ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm256_store_##suffix(ptr, a.val); } \
+   inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+   { _mm256_stream_##suffix(ptr, a.val); } \
+   inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+   { \
+       if( mode == hal::STORE_UNALIGNED ) \
+           _mm256_storeu_##suffix(ptr, a.val); \
+       else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+           _mm256_stream_##suffix(ptr, a.val); \
+       else \
+           _mm256_store_##suffix(ptr, a.val); \
+   } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -407,6 +429,11 @@ inline v_float16x16 v256_load_f16(const short* ptr)
inline v_float16x16 v256_load_f16_aligned(const short* ptr)
{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }

+inline v_float16x16 v256_load_f16_low(const short* ptr)
+{ return v_float16x16(v256_load_low(ptr).val); }
+inline v_float16x16 v256_load_f16_halves(const short* ptr0, const short* ptr1)
+{ return v_float16x16(v256_load_halves(ptr0, ptr1).val); }

inline void v_store(short* ptr, const v_float16x16& a)
{ _mm256_storeu_si256((__m256i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x16& a)
@@ -819,93 +846,79 @@ OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
-    switch(imm)
-    {
-        case 0:  return a;
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
-    return v_uint8x32();
+    enum {IMM_R = (16 - imm) & 0xFF};
+    enum {IMM_R2 = (32 - imm) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
+    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
-    switch(imm)
-    {
-        case 0:  return a;
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
-    return v_uint8x32();
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
+    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
-    v_uint8x32 res;
-    // ESAC control[3] ? [127:0] = 0
-    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
-    if (imm == 0)
-        return a;
-    if (imm == 16)
-        res.val = swapz;
-    else if (imm < 16)
-        res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
-    else if (imm < 32)
-        res.val = _mm256_slli_si256(swapz, imm - 16);
-    else
-        return v_uint8x32();
-    return res;
+    enum {IMM_L = (imm - 16) & 0xFF};
+    enum {IMM_R = (16 - imm) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // ESAC control[3] ? [127:0] = 0
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
+    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
-    v_uint8x32 res;
-    // ESAC control[3] ? [127:0] = 0
-    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
-    if (imm == 0)
-        return a;
-    if (imm == 16)
-        res.val = swapz;
-    else if (imm < 16)
-        res.val = _mm256_alignr_epi8(swapz, a.val, imm);
-    else if (imm < 32)
-        res.val = _mm256_srli_si256(swapz, imm - 16);
-    else
-        return v_uint8x32();
-    return res;
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // ESAC control[3] ? [127:0] = 0
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
+    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
}

#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)    \
    template<int imm>                                            \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)       \
    {                                                            \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a),  \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),   \
                                        v_reinterpret_as_u8(b)); \
        return _Tpvec(cast(ret.val));                            \
    }                                                            \
    template<int imm>                                            \
    inline _Tpvec intrin(const _Tpvec& a)                        \
    {                                                            \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));  \
        return _Tpvec(cast(ret.val));                            \
    }
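A semantics sketch for the refactored rotate intrinsics (illustrative; assumes an AVX2 build, lane values chosen for demonstration only):

// Sketch: v_rotate_right<imm>(a, b) shifts lanes of a down by imm,
// filling the top lanes from b. imm must be a compile-time constant
// because _mm256_alignr_epi8 takes an immediate, which is why the
// refactoring above folds the shift amounts into enum constants.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void rotate_demo(const uchar buf[32], uchar out[32])
{
    v_uint8x32 a = v256_load(buf);      // lanes 0..31
    v_uint8x32 z = v256_setzero_u8();
    // Result lanes are a[3..31] followed by three zeros from z.
    v_uint8x32 r = v_rotate_right<3>(a, z);
    v_store(out, r);
}
#endif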
@@ -1616,7 +1629,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

-   static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
@@ -1633,7 +1646,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16&
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));

-   static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
@@ -1683,16 +1696,16 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g,
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);

-   static const __m256i
+   const __m256i
    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
@@ -1717,18 +1730,18 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16&
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);

-   static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+   const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                          0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
-   static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+   const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
                                          2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
-   static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+   const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
                                          4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
@@ -1785,7 +1798,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g,
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));

-   static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
@@ -1820,7 +1833,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16&
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));

-   static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+   const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
                                        0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
@@ -1901,7 +1914,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g
///////////////////////////// store interleave /////////////////////////////////////

-inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);

@@ -1909,11 +1923,25 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 32), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+   }
}
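For context, a hedged usage sketch of the interleaving store with the new mode argument (illustrative; assumes an AVX2 build and a 32-byte aligned destination):

// Sketch: v_store_interleave with the StoreMode parameter added above.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_AVX2
void interleave_xy(const uchar* x, const uchar* y, uchar* dst /* aligned */)
{
    v_uint8x32 vx = v256_load(x);
    v_uint8x32 vy = v256_load(y);
    // Takes the _mm256_stream_si256 branch added in this commit.
    v_store_interleave(dst, vx, vy, hal::STORE_ALIGNED_NOCACHE);
}
#endif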
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);

@@ -1921,11 +1949,25 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 16), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+   }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);

@@ -1933,11 +1975,25 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 8), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+   }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);

@@ -1945,19 +2001,33 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64
    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, xy0);
+       _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, xy0);
+       _mm256_store_si256((__m256i*)(ptr + 4), xy1);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+   }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
-   static const __m256i sh_b = _mm256_setr_epi8(
+   const __m256i sh_b = _mm256_setr_epi8(
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
-   static const __m256i sh_g = _mm256_setr_epi8(
+   const __m256i sh_g = _mm256_setr_epi8(
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
-   static const __m256i sh_r = _mm256_setr_epi8(
+   const __m256i sh_r = _mm256_setr_epi8(
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);

@@ -1965,9 +2035,9 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);

-   static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+   const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);

@@ -1978,20 +2048,36 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
+       _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
+       _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+   }
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
-   static const __m256i sh_b = _mm256_setr_epi8(
+   const __m256i sh_b = _mm256_setr_epi8(
        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
-   static const __m256i sh_g = _mm256_setr_epi8(
+   const __m256i sh_g = _mm256_setr_epi8(
        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
-   static const __m256i sh_r = _mm256_setr_epi8(
+   const __m256i sh_r = _mm256_setr_epi8(
        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);

@@ -1999,9 +2085,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);

-   static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+   const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
-   static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+   const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);

@@ -2012,12 +2098,28 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
    //__m256i bgr1 = p1;
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 16), p1);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 16), p1);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+   }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
    __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);

@@ -2031,12 +2133,28 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
    //__m256i bgr1 = p2;
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 8), p2);
+       _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 8), p2);
+       _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+   }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
    __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);

@@ -2046,12 +2164,29 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64
    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgr0);
+       _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
+       _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgr0);
+       _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
+       _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+   }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
+                                const v_uint8x32& r, const v_uint8x32& a,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);

@@ -2068,14 +2203,32 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

+   if( mode == hal::STORE_ALIGNED_NOCACHE )
+   {
+       _mm256_stream_si256((__m256i*)ptr, bgra0);
+       _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
+       _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
+       _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
+   }
+   else if( mode == hal::STORE_ALIGNED )
+   {
+       _mm256_store_si256((__m256i*)ptr, bgra0);
+       _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
+       _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
+       _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
+   }
+   else
+   {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+   }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
                                const v_uint16x16& r, const v_uint16x16& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
@@ -2092,14 +2245,32 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
                                const v_uint32x8& r, const v_uint32x8& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
@@ -2116,14 +2287,32 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
                                const v_uint64x4& r, const v_uint64x4& a,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
    __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
@@ -2135,10 +2324,27 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64
    __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
    }
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
@@ -2166,27 +2372,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
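A note on the extended API: every `v_store_interleave` overload above now takes an optional `hal::StoreMode`, so callers can request aligned or non-temporal (streaming) stores without duplicating the shuffle logic. A minimal usage sketch (assuming an AVX2 build; `dst`, `srcB`, `srcG`, `srcR` are hypothetical caller-managed buffers, and `dst` must be 32-byte aligned for the non-unaligned modes):

    // Interleave three 32-lane planes into packed BGR, bypassing the cache.
    v_uint8x32 b = v256_load(srcB);
    v_uint8x32 g = v256_load(srcG);
    v_uint8x32 r = v256_load(srcR);
    v_store_interleave(dst, b, g, r, hal::STORE_ALIGNED_NOCACHE);

Non-temporal stores pay off when the output is larger than the cache and will not be re-read soon, which is exactly the `merge`/`split` use case this series targets.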

@@ -1319,7 +1319,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i2;
    for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1339,7 +1340,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i3;
    for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1360,7 +1362,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                                               const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                                               const v_reg<_Tp, n>& d,
                                                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    int i, i4;
    for( i = i4 = 0; i < n; i++, i4 += 4 )
@@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
        ptr[i] = a.s[i];
}

template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
    for( int i = 0; i < n; i++ )
        ptr[i] = a.s[i];
}

template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
    for( int i = 0; i < n; i++ )
        ptr[i] = a.s[i];
}

/** @brief Combine vector from first elements of two vectors

Scheme:
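In the portable C++ fallback both of the new overloads intentionally ignore the mode and perform a plain element-wise copy, so code written against the mode-taking API still compiles and behaves correctly when no SIMD backend is available. A sketch (with `buf` a hypothetical suitably aligned float buffer):

    v_float32x4 v = v_setall_f32(1.f);
    // Degrades to ptr[i] = a.s[i]; the no-cache hint is simply dropped here.
    v_store_aligned(buf, v, hal::STORE_ALIGNED_NOCACHE);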

@@ -319,6 +319,9 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a)
#endif
}

#ifndef vdup_n_f16
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif

struct v_float16x8
{
@@ -864,6 +867,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -889,6 +896,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }

inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); }

inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
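The two new FP16 loaders mirror the existing `v_load_low`/`v_load_halves` helpers for other element types. A usage sketch (assuming `row0` and `row1` each point to at least four FP16 values stored as `short`):

    v_float16x8 lo   = v_load_f16_low(row0);           // row0[0..3] in the low half, high half zeroed
    v_float16x8 both = v_load_f16_halves(row0, row1);  // row0[0..3] | row1[0..3]

This allows two independent 4-element rows to be processed in one 8-lane register, for instance in loop tails.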
@@ -1292,14 +1304,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
@@ -1308,7 +1322,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
@@ -1360,7 +1375,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1369,7 +1385,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1380,7 +1397,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \

@@ -788,7 +788,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x)
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
@@ -801,7 +801,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x)
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
@@ -1261,6 +1261,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_si128((__m128i*)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_si128((__m128i*)ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_si128((__m128i*)ptr, a.val); \
    else \
        _mm_store_si128((__m128i*)ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
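The runtime `mode` overload lets a single loop body pick the store flavor per destination instead of per call site. A sketch of the intended pattern (mirroring the `y_aligned` check used by `exp32f` further down in this patch; `dst` is a hypothetical output pointer):

    v_int32x4 v = v_setall_s32(42);
    hal::StoreMode mode = ((size_t)(void*)dst % 16 == 0) ? hal::STORE_ALIGNED_NOCACHE
                                                         : hal::STORE_UNALIGNED;
    v_store(dst, v, mode);  // dispatches to _mm_storeu / _mm_stream / _mm_store

Note that `_mm_stream_si128` requires a 16-byte-aligned address, so STORE_ALIGNED_NOCACHE must only be requested for aligned pointers.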
@@ -1292,6 +1303,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_##suffix(ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_##suffix(ptr, a.val); \
    else \
        _mm_store_##suffix(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1308,6 +1330,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }

inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(v_load_low(ptr).val); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(v_load_halves(ptr0, ptr1).val); }

inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
@@ -1671,17 +1698,17 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    a0 = _mm_shuffle_epi8(a0, sh_b);
    b0 = _mm_shuffle_epi8(b0, sh_g);
    c0 = _mm_shuffle_epi8(c0, sh_r);
@@ -1689,9 +1716,9 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    b.val = b0;
    c.val = c0;
#elif CV_SSSE3
    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
@@ -1784,9 +1811,9 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    a0 = _mm_shuffle_epi8(a0, sh_a);
    b0 = _mm_shuffle_epi8(b0, sh_b);
    c0 = _mm_shuffle_epi8(c0, sh_c);
@@ -1955,55 +1982,61 @@ inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
// store interleave

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
#elif CV_SSSE3
    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);

    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i v0 = _mm_shuffle_epi8(t0, m0);

    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i v1 = _mm_shuffle_epi8(t1, m1);

    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i v2 = _mm_shuffle_epi8(t2, m2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
@@ -2042,15 +2075,31 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x1
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
        _mm_stream_si128((__m128i*)(ptr + 32), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
        _mm_store_si128((__m128i*)(ptr + 32), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
@@ -2062,33 +2111,64 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x1
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
        _mm_stream_si128((__m128i*)(ptr + 32), v2);
        _mm_stream_si128((__m128i*)(ptr + 48), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
        _mm_store_si128((__m128i*)(ptr + 32), v2);
        _mm_store_si128((__m128i*)(ptr + 48), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
        _mm_storeu_si128((__m128i*)(ptr + 48), v3);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b, const v_uint16x8& c,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
@@ -2096,10 +2176,6 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
@@ -2128,15 +2204,30 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
        _mm_stream_si128((__m128i*)(ptr + 16), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
        _mm_store_si128((__m128i*)(ptr + 16), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
@@ -2148,27 +2239,58 @@ inline void v_store_interleave( ushort* ptr, const v_uint16
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
        _mm_stream_si128((__m128i*)(ptr + 16), v2);
        _mm_stream_si128((__m128i*)(ptr + 24), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
        _mm_store_si128((__m128i*)(ptr + 16), v2);
        _mm_store_si128((__m128i*)(ptr + 24), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
        _mm_storeu_si128((__m128i*)(ptr + 24), v3);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 4), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 4), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
@@ -2177,35 +2299,82 @@ inline void v_store_interleave( unsigned* ptr, const v_uint
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 4), v1);
        _mm_stream_si128((__m128i*)(ptr + 8), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 4), v1);
        _mm_store_si128((__m128i*)(ptr + 8), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
        _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    }
}
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    v_uint32x4 v0, v1, v2, v3;
    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0.val);
        _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
        _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
        _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0.val);
        _mm_store_si128((__m128i*)(ptr + 4), v1.val);
        _mm_store_si128((__m128i*)(ptr + 8), v2.val);
        _mm_store_si128((__m128i*)(ptr + 12), v3.val);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0.val);
        _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
        _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
        _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
    }
}
// 2-channel, float only

inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
    }
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
@@ -2217,13 +2386,29 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
        _mm_stream_ps(ptr + 8, v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
        _mm_store_ps(ptr + 8, v2);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
        _mm_storeu_ps(ptr + 8, v2);
    }
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, const v_float32x4& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
@@ -2234,43 +2419,109 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    __m128 v1 = _mm_unpackhi_ps(u0, u1);
    __m128 v3 = _mm_unpackhi_ps(u2, u3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
        _mm_stream_ps(ptr + 8, v2);
        _mm_stream_ps(ptr + 12, v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
        _mm_store_ps(ptr + 8, v2);
        _mm_store_ps(ptr + 12, v3);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
        _mm_storeu_ps(ptr + 8, v2);
        _mm_storeu_ps(ptr + 12, v3);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
        _mm_stream_si128((__m128i*)(ptr + 4), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
        _mm_store_si128((__m128i*)(ptr + 4), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
    }
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, const v_uint64x2& d,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
    __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
    __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 2), v1);
        _mm_stream_si128((__m128i*)(ptr + 4), v2);
        _mm_stream_si128((__m128i*)(ptr + 6), v3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 2), v1);
        _mm_store_si128((__m128i*)(ptr + 4), v2);
        _mm_store_si128((__m128i*)(ptr + 6), v3);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
        _mm_storeu_si128((__m128i*)(ptr + 6), v3);
    }
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
@@ -2298,27 +2549,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)

@@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
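Note that this VSX mapping has no separate streaming store: STORE_ALIGNED_NOCACHE and STORE_ALIGNED both take the `st_a` path, and only STORE_UNALIGNED differs, so the cache hint is honored only where the ISA actually provides one. For example:

    v_store(ptr, v, hal::STORE_ALIGNED_NOCACHE);  // on VSX: a plain aligned store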
@@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
                                _Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
                               const _Tpvec& b, const _Tpvec& c, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               const _Tpvec& c, const _Tpvec& d, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }

OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)

@@ -457,6 +457,18 @@ Returns empty string if feature is not defined
*/
CV_EXPORTS_W String getHardwareFeatureName(int feature);
/** @brief Returns the list of CPU features enabled during compilation.

The returned value is a string containing a space-separated list of CPU features with the following markers:

- no marker - baseline feature
- prefix `*` - feature enabled in the dispatcher
- suffix `?` - feature enabled but not available in hardware

Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
*/
CV_EXPORTS std::string getCPUFeaturesLine();
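A usage sketch for the new query (hypothetical diagnostics snippet; the function itself is part of this patch):

    #include <iostream>
    #include <opencv2/core/utility.hpp>

    int main()
    {
        // Prints e.g. "SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2"
        std::cout << "CPU features: " << cv::getCPUFeaturesLine() << std::endl;
        return 0;
    }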
/** @brief Returns the number of logical CPUs available for the process.
*/
CV_EXPORTS_W int getNumberOfCPUs();

@@ -1180,7 +1180,8 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
    CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
               op == CMP_NE || op == CMP_GE || op == CMP_GT );

    CV_Assert(_src1.empty() == _src2.empty());
    if (_src1.empty() && _src2.empty())
    {
        _dst.release();
        return;
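The new check tightens the empty-input contract of `cv::compare`: both inputs empty yields an empty output, while a mix of empty and non-empty now fails the assertion instead of silently returning. A sketch of the resulting behavior:

    cv::Mat a, b, dst;
    cv::compare(a, b, dst, cv::CMP_EQ);       // OK: both empty -> dst is released (empty)
    // cv::Mat c = cv::Mat::ones(3, 3, CV_8U);
    // cv::compare(a, c, dst, cv::CMP_EQ);    // would now trigger the CV_Assert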

@@ -411,7 +411,8 @@ Mat& Mat::operator = (const Scalar& s)
{
    CV_INSTRUMENT_REGION()

    if (this->empty())
        return *this;

    const Mat* arrays[] = { this };
    uchar* dptr;

@@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n )
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
    const v_float32 vminval = vx_setall_f32(minval);
    const v_float32 vmaxval = vx_setall_f32(maxval);
    const v_float32 vA1 = vx_setall_f32((float)A1);
    const v_float32 vA2 = vx_setall_f32((float)A2);
    const v_float32 vA3 = vx_setall_f32((float)A3);
    const v_float32 vA4 = vx_setall_f32((float)A4);
    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n )
#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);
    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);
    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n )
#if CV_SIMD #if CV_SIMD
const int VECSZ = v_float32::nlanes; const int VECSZ = v_float32::nlanes;
static const v_float32 vln2 = vx_setall_f32((float)ln_2); const v_float32 vln2 = vx_setall_f32((float)ln_2);
static const v_float32 v1 = vx_setall_f32(1.f); const v_float32 v1 = vx_setall_f32(1.f);
static const v_float32 vshift = vx_setall_f32(-1.f/512); const v_float32 vshift = vx_setall_f32(-1.f/512);
static const v_float32 vA0 = vx_setall_f32(A0); const v_float32 vA0 = vx_setall_f32(A0);
static const v_float32 vA1 = vx_setall_f32(A1); const v_float32 vA1 = vx_setall_f32(A1);
static const v_float32 vA2 = vx_setall_f32(A2); const v_float32 vA2 = vx_setall_f32(A2);
for( ; i < n; i += VECSZ ) for( ; i < n; i += VECSZ )
{ {
@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n )
#if CV_SIMD_64F #if CV_SIMD_64F
const int VECSZ = v_float64::nlanes; const int VECSZ = v_float64::nlanes;
static const v_float64 vln2 = vx_setall_f64(ln_2); const v_float64 vln2 = vx_setall_f64(ln_2);
static const v_float64 const v_float64
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
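One plausible reason for dropping `static` on these vector constants (my inference; the diff itself does not state it): a function-local static is initialized once behind a thread-safety guard that is then checked on every call, and `vx_setall_*` yields a register whose width follows the SIMD mode of the translation unit, which makes a cached value both needless overhead and a hazard under runtime dispatch. A sketch of the adopted pattern (illustrative code, not from the patch):

#include <opencv2/core/hal/intrin.hpp>

static void scale_array(const float* x, float* y, int n, float s)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_float32::nlanes;
    const cv::v_float32 vs = cv::vx_setall_f32(s); // per-call init, no static guard
    for( ; i <= n - VECSZ; i += VECSZ )
        cv::v_store(y + i, cv::vx_load(x + i) * vs);
    cv::vx_cleanup();
#endif
    for( ; i < n; i++ )
        y[i] = x[i]*s;
}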

@@ -602,13 +602,13 @@ void Mat::pop_back(size_t nelems)

void Mat::push_back_(const void* elem)
{
-   int r = size.p[0];
+   size_t r = size.p[0];
    if( isSubmatrix() || dataend + step.p[0] > datalimit )
        reserve( std::max(r + 1, (r*3+1)/2) );

    size_t esz = elemSize();
    memcpy(data + r*step.p[0], elem, esz);
-   size.p[0] = r + 1;
+   size.p[0] = int(r + 1);
    dataend += step.p[0];

    uint64 tsz = size.p[0];
    for( int i = 1; i < dims; i++ )
@@ -709,7 +709,8 @@ void Mat::resize(size_t nelems, const Scalar& s)

void Mat::push_back(const Mat& elems)
{
-   int r = size.p[0], delta = elems.size.p[0];
+   size_t r = size.p[0];
+   size_t delta = elems.size.p[0];
    if( delta == 0 )
        return;

    if( this == &elems )
@@ -726,7 +727,7 @@ void Mat::push_back(const Mat& elems)
    size.p[0] = elems.size.p[0];
    bool eq = size == elems.size;
-   size.p[0] = r;
+   size.p[0] = int(r);
    if( !eq )
        CV_Error(CV_StsUnmatchedSizes, "Pushed vector length is not equal to matrix row length");
    if( type() != elems.type() )
@@ -735,7 +736,7 @@ void Mat::push_back(const Mat& elems)
    if( isSubmatrix() || dataend + step.p[0]*delta > datalimit )
        reserve( std::max(r + delta, (r*3+1)/2) );

-   size.p[0] += delta;
+   size.p[0] += int(delta);
    dataend += step.p[0]*delta;

    //updateContinuityFlag(*this);
@@ -744,7 +745,7 @@ void Mat::push_back(const Mat& elems)
        memcpy(data + r*step.p[0], elems.data, elems.total()*elems.elemSize());
    else
    {
-       Mat part = rowRange(r, r + delta);
+       Mat part = rowRange(int(r), int(r + delta));
        elems.copyTo(part);
    }
}

@@ -766,11 +766,13 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    CV_INSTRUMENT_REGION()

+   CV_Assert(!_src.empty());
+   CV_Assert( _mask.empty() || _mask.type() == CV_8UC1 );

    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
-   CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
               openvx_meanStdDev(src, _mean, _sdv, mask))

@@ -9,21 +9,58 @@
namespace cv { namespace hal {

#if CV_SIMD
+/*
+  The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
+  on IA there are instructions movntps and such to which
+  v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
+  Those instructions write directly into memory w/o touching cache
+  that results in dramatic speed improvements, especially on
+  large arrays (FullHD, 4K etc.).
+  Those intrinsics require the destination address to be aligned
+  by 16/32 bytes (with SSE2 and AVX2, respectively).
+  So we potentially split the processing into 3 stages:
+  1) the optional prefix part [0:i0), where we use simple unaligned stores.
+  2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
+     But in some cases we have to use unaligned stores in this part.
+  3) the optional suffix part (the tail) (len - VECSZ:len) where we switch back
+     to "unaligned" mode to process the remaining elements.
+  In principle there can be very poorly aligned data where there is no main part.
+  For that we set i0=0 and use unaligned stores for the whole array.
+*/
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
-   int i;
+   const int VECSZ = VecT::nlanes;
+   int i, i0 = 0;
    const T* src0 = src[0];
    const T* src1 = src[1];
-   const int VECSZ = VecT::nlanes;
+   int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
+   hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+   if( r != 0 )
+   {
+       mode = hal::STORE_UNALIGNED;
+       if( r % cn == 0 && len > VECSZ )
+           i0 = VECSZ - (r / cn);
+   }

    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
-           v_store_interleave(dst + i*cn, a, b);
+           v_store_interleave(dst + i*cn, a, b, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else if( cn == 3 )
@@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src2 = src[2];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
-           v_store_interleave(dst + i*cn, a, b, c);
+           v_store_interleave(dst + i*cn, a, b, c, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else
@@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src3 = src[3];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
            VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
-           v_store_interleave(dst + i*cn, a, b, c, d);
+           v_store_interleave(dst + i*cn, a, b, c, d, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    vx_cleanup();
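A compact sketch of the three-stage pattern the comment above describes (my illustration with a plain copy, not the patch code); the aligned "nocache" store overload used here is the same one the new vecmerge_/vecsplit_ code relies on:

#include <opencv2/core/hal/intrin.hpp>

static void copy_nocache(const float* src, float* dst, int len)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_float32::nlanes;
    const size_t align = VECSZ * sizeof(float);
    if( len >= VECSZ )
    {
        // 1) prefix: scalar stores until dst + i reaches the required alignment
        while( i < len && ((size_t)(void*)(dst + i) % align) != 0 )
            dst[i] = src[i], i++;
        // 2) main part: aligned non-temporal stores that bypass the cache
        for( ; i + VECSZ <= len; i += VECSZ )
            cv::v_store(dst + i, cv::vx_load(src + i), cv::hal::STORE_ALIGNED_NOCACHE);
        cv::vx_cleanup();
    }
#endif
    // 3) tail: the leftover elements go through plain scalar stores again
    for( ; i < len; i++ )
        dst[i] = src[i];
}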

@@ -2834,7 +2834,22 @@ extern "C" {

static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
{
+   try
+   {
        ((cv::ocl::Kernel::Impl*)p)->finit(e);
+   }
+   catch (const cv::Exception& exc)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
+   }
+   catch (const std::exception& exc)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
+   }
+   catch (...)
+   {
+       CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
+   }
}

}

@@ -511,8 +511,8 @@ static RandnScaleFunc randnScaleTab[] =
void RNG::fill( InputOutputArray _mat, int disttype,
                InputArray _param1arg, InputArray _param2arg, bool saturateRange )
{
-   if (_mat.empty())
-       return;
+   CV_Assert(!_mat.empty());
    Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat();
    int depth = mat.depth(), cn = mat.channels();
    AutoBuffer<double> _parambuf;

@@ -9,23 +9,46 @@
namespace cv { namespace hal {

#if CV_SIMD
+// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
-   int i;
+   const int VECSZ = VecT::nlanes;
+   int i, i0 = 0;
    T* dst0 = dst[0];
    T* dst1 = dst[1];
-   const int VECSZ = VecT::nlanes;
+   int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T)));
+   int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
+   int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0;
+   int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
+   hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+   if( (r0|r1|r2|r3) != 0 )
+   {
+       mode = hal::STORE_UNALIGNED;
+       if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
+           i0 = VECSZ - (r0 / cn);
+   }

    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b;
            v_load_deinterleave(src + i*cn, a, b);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else if( cn == 3 )
@@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst2 = dst[2];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b, c;
            v_load_deinterleave(src + i*cn, a, b, c);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
-           v_store(dst2 + i, c);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           v_store(dst2 + i, c, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    else
@@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst3 = dst[3];
        for( i = 0; i < len; i += VECSZ )
        {
-           i = std::min( len - VECSZ, i );
+           if( i > len - VECSZ )
+           {
+               i = len - VECSZ;
+               mode = hal::STORE_UNALIGNED;
+           }
            VecT a, b, c, d;
            v_load_deinterleave(src + i*cn, a, b, c, d);
-           v_store(dst0 + i, a);
-           v_store(dst1 + i, b);
-           v_store(dst2 + i, c);
-           v_store(dst3 + i, d);
+           v_store(dst0 + i, a, mode);
+           v_store(dst1 + i, b, mode);
+           v_store(dst2 + i, c, mode);
+           v_store(dst3 + i, d, mode);
+           if( i < i0 )
+           {
+               i = i0 - VECSZ;
+               mode = hal::STORE_ALIGNED_NOCACHE;
+           }
        }
    }
    vx_cleanup();

@@ -654,6 +654,27 @@ String getHardwareFeatureName(int feature)
    return name ? String(name) : String();
}

+std::string getCPUFeaturesLine()
+{
+    const int features[] = { CV_CPU_BASELINE_FEATURES, CV_CPU_DISPATCH_FEATURES };
+    const int sz = sizeof(features) / sizeof(features[0]);
+    std::string result;
+    std::string prefix;
+    for (int i = 1; i < sz; ++i)
+    {
+        if (features[i] == 0)
+        {
+            prefix = "*";
+            continue;
+        }
+        if (i != 1) result.append(" ");
+        result.append(prefix);
+        result.append(getHWFeatureNameSafe(features[i]));
+        if (!checkHardwareSupport(features[i])) result.append("?");
+    }
+    return result;
+}
+
volatile bool useOptimizedFlag = true;

void setUseOptimized( bool flag )

@@ -84,14 +84,11 @@ UMatData::~UMatData()
    allocatorFlags_ = 0;
    if (originalUMatData)
    {
-       UMatData* u = originalUMatData;
-       CV_XADD(&(u->urefcount), -1);
-       CV_XADD(&(u->refcount), -1);
        bool showWarn = false;
-       if (u->refcount == 0)
+       UMatData* u = originalUMatData;
+       bool zero_Ref = CV_XADD(&(u->refcount), -1) == 1;
+       if (zero_Ref)
        {
-           if (u->urefcount > 0)
-               showWarn = true;
            // simulate Mat::deallocate
            if (u->mapcount != 0)
            {
@@ -102,7 +99,10 @@ UMatData::~UMatData()
                // we don't do "map", so we can't do "unmap"
            }
        }
-       if (u->refcount == 0 && u->urefcount == 0) // oops, we need to free resources
+       bool zero_URef = CV_XADD(&(u->urefcount), -1) == 1;
+       if (zero_Ref && !zero_URef)
+           showWarn = true;
+       if (zero_Ref && zero_URef) // oops, we need to free resources
        {
            showWarn = true;
            // simulate UMat::deallocate
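The idiom adopted above deserves a note: CV_XADD is an atomic fetch-and-add that returns the value before the update, so `CV_XADD(&ref, -1) == 1` tells exactly one thread that it just dropped the last reference, whereas re-reading the counter after the decrement (the old code) races with other owners. A sketch of the same pattern with std::atomic (an illustration, not OpenCV code):

#include <atomic>

struct Shared { std::atomic<int> refcount{1}; };

static void release(Shared* s)
{
    // fetch_sub returns the previous value: only the thread that sees 1
    // released the last reference, so only it may free the object.
    if (s->refcount.fetch_sub(1) == 1)
        delete s;
}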

@@ -2008,11 +2008,9 @@ TEST(Subtract, scalarc4_matc4)
TEST(Compare, empty)
{
    cv::Mat temp, dst1, dst2;
-   cv::compare(temp, temp, dst1, cv::CMP_EQ);
-   dst2 = temp > 5;
+   EXPECT_NO_THROW(cv::compare(temp, temp, dst1, cv::CMP_EQ));
    EXPECT_TRUE(dst1.empty());
-   EXPECT_TRUE(dst2.empty());
+   EXPECT_THROW(dst2 = temp > 5, cv::Exception);
}

TEST(Compare, regression_8999)
@@ -2020,9 +2018,7 @@ TEST(Compare, regression_8999)
    Mat_<double> A(4,1); A << 1, 3, 2, 4;
    Mat_<double> B(1,1); B << 2;
    Mat C;
-   ASSERT_ANY_THROW({
-       cv::compare(A, B, C, CMP_LT);
-   });
+   EXPECT_THROW(cv::compare(A, B, C, CMP_LT), cv::Exception);
}

@@ -43,106 +43,35 @@
namespace opencv_test { namespace {

-class Core_ConcatenationTest : public cvtest::BaseTest
-{
-public:
-    Core_ConcatenationTest(bool horizontal, bool firstEmpty, bool secondEmpty);
-protected:
-    int prepare_test_case( int );
-    void run_func();
-    int validate_test_results( int );
-
-    Mat mat0x5;
-    Mat mat10x5;
-    Mat mat20x5;
-
-    Mat mat5x0;
-    Mat mat5x10;
-    Mat mat5x20;
-
-    Mat result;
-
-    bool horizontal;
-    bool firstEmpty;
-    bool secondEmpty;
-
-private:
-    static bool areEqual(const Mat& m1, const Mat& m2);
-};
-
-Core_ConcatenationTest::Core_ConcatenationTest(bool horizontal_, bool firstEmpty_, bool secondEmpty_)
-    : horizontal(horizontal_)
-    , firstEmpty(firstEmpty_)
-    , secondEmpty(secondEmpty_)
-{
-    test_case_count = 1;
-
-    mat0x5 = Mat::ones(0,5, CV_8U);
-    mat10x5 = Mat::ones(10,5, CV_8U);
-    mat20x5 = Mat::ones(20,5, CV_8U);
-    mat5x0 = Mat::ones(5,0, CV_8U);
-    mat5x10 = Mat::ones(5,10, CV_8U);
-    mat5x20 = Mat::ones(5,20, CV_8U);
-}
-
-int Core_ConcatenationTest::prepare_test_case( int test_case_idx )
-{
-    cvtest::BaseTest::prepare_test_case( test_case_idx );
-    return 1;
-}
-
-void Core_ConcatenationTest::run_func()
-{
-    if (horizontal)
-    {
-        cv::hconcat((firstEmpty ? mat5x0 : mat5x10),
-                    (secondEmpty ? mat5x0 : mat5x10),
-                    result);
-    } else {
-        cv::vconcat((firstEmpty ? mat0x5 : mat10x5),
-                    (secondEmpty ? mat0x5 : mat10x5),
-                    result);
-    }
-}
-
-int Core_ConcatenationTest::validate_test_results( int )
-{
-    Mat expected;
-
-    if (firstEmpty && secondEmpty)
-        expected = (horizontal ? mat5x0 : mat0x5);
-    else if ((firstEmpty && !secondEmpty) || (!firstEmpty && secondEmpty))
-        expected = (horizontal ? mat5x10 : mat10x5);
-    else
-        expected = (horizontal ? mat5x20 : mat20x5);
-
-    if (areEqual(expected, result))
-    {
-        return cvtest::TS::OK;
-    } else
-    {
-        ts->printf( cvtest::TS::LOG, "Concatenation failed");
-        ts->set_failed_test_info( cvtest::TS::FAIL_MISMATCH );
-    }
-    return cvtest::TS::OK;
-}
-
-bool Core_ConcatenationTest::areEqual(const Mat &m1, const Mat &m2)
-{
-    return m1.size() == m2.size()
-        && m1.type() == m2.type()
-        && countNonZero(m1 != m2) == 0;
-}
-
-TEST(Core_Concatenation, hconcat_empty_nonempty) { Core_ConcatenationTest test(true, true, false); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_nonempty_empty) { Core_ConcatenationTest test(true, false, true); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_empty_empty) { Core_ConcatenationTest test(true, true, true); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_empty_nonempty) { Core_ConcatenationTest test(false, true, false); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_nonempty_empty) { Core_ConcatenationTest test(false, false, true); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_empty_empty) { Core_ConcatenationTest test(false, true, true); test.safe_run(); }
+TEST(Core_Concatenation, empty)
+{
+    const Mat mat0x5(0,5, CV_8U, Scalar::all(1));
+    const Mat mat10x5(10,5, CV_8U, Scalar::all(1));
+    const Mat mat20x5(20,5, CV_8U, Scalar::all(1));
+
+    const Mat mat5x0(5,0, CV_8U, Scalar::all(1));
+    const Mat mat5x10(5,10, CV_8U, Scalar::all(1));
+    const Mat mat5x20(5,20, CV_8U, Scalar::all(1));
+
+    Mat result;
+
+    cv::hconcat(mat5x0, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x0, 0);
+    cv::hconcat(mat5x0, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x20, 0);
+
+    cv::vconcat(mat0x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat0x5, 0);
+    cv::vconcat(mat0x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat20x5, 0);
+}

}} // namespace

@@ -0,0 +1,5 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "test_intrin.simd.hpp"

@@ -2,249 +2,101 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"

-#include "test_intrin_utils.hpp"
-#define CV_CPU_SIMD_FILENAME "test_intrin_utils.hpp"
+#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
#define CV_CPU_DISPATCH_MODE FP16
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

-using namespace cv;
+#define CV_CPU_DISPATCH_MODE AVX2
+#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

namespace opencv_test { namespace hal {
using namespace CV_CPU_OPTIMIZATION_NAMESPACE;

-//============= 8-bit integer =====================================================================
-
-TEST(hal_intrin, uint8x16) {
-    TheTest<v_uint8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-TEST(hal_intrin, int8x16) {
-    TheTest<v_int8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-//============= 16-bit integer =====================================================================
-
-TEST(hal_intrin, uint16x8) {
-    TheTest<v_uint16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-TEST(hal_intrin, int16x8) {
-    TheTest<v_int16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-//============= 32-bit integer =====================================================================
-
-TEST(hal_intrin, uint32x4) {
-    TheTest<v_uint32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
-
-TEST(hal_intrin, int32x4) {
-    TheTest<v_int32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
-
-//============= 64-bit integer =====================================================================
-
-TEST(hal_intrin, uint64x2) {
-    TheTest<v_uint64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-TEST(hal_intrin, int64x2) {
-    TheTest<v_int64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-//============= Floating point =====================================================================
-
-TEST(hal_intrin, float32x4) {
-    TheTest<v_float32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-}
-
-#if CV_SIMD128_64F
-TEST(hal_intrin, float64x2) {
-    TheTest<v_float64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-#endif
+TEST(hal_intrin, uint8x16)
+{ test_hal_intrin_uint8(); }
+
+TEST(hal_intrin, int8x16)
+{ test_hal_intrin_int8(); }
+
+TEST(hal_intrin, uint16x8)
+{ test_hal_intrin_uint16(); }
+
+TEST(hal_intrin, int16x8)
+{ test_hal_intrin_int16(); }
+
+TEST(hal_intrin, int32x4)
+{ test_hal_intrin_int32(); }
+
+TEST(hal_intrin, uint32x4)
+{ test_hal_intrin_uint32(); }
+
+TEST(hal_intrin, uint64x2)
+{ test_hal_intrin_uint64(); }
+
+TEST(hal_intrin, int64x2)
+{ test_hal_intrin_int64(); }
+
+TEST(hal_intrin, float32x4)
+{ test_hal_intrin_float32(); }
+
+TEST(hal_intrin, float64x2)
+{ test_hal_intrin_float64(); }

-TEST(hal_intrin,float16)
+TEST(hal_intrin, float16x8)
{
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
}

-}}
+#define DISPATCH_SIMD_MODES AVX2
+#define DISPATCH_SIMD_NAME "SIMD256"
+#define DISPATCH_SIMD(fun) \
+    do { \
+        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES); \
+        throw SkipTestException( \
+            "Unsupported hardware: " \
+            DISPATCH_SIMD_NAME \
+            " is not available" \
+        ); \
+    } while(0)
+
+TEST(hal_intrin256, uint8x32)
+{ DISPATCH_SIMD(test_hal_intrin_uint8); }
+
+TEST(hal_intrin256, int8x32)
+{ DISPATCH_SIMD(test_hal_intrin_int8); }
+
+TEST(hal_intrin256, uint16x16)
+{ DISPATCH_SIMD(test_hal_intrin_uint16); }
+
+TEST(hal_intrin256, int16x16)
+{ DISPATCH_SIMD(test_hal_intrin_int16); }
+
+TEST(hal_intrin256, uint32x8)
+{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+
+TEST(hal_intrin256, int32x8)
+{ DISPATCH_SIMD(test_hal_intrin_int32); }
+
+TEST(hal_intrin256, uint64x4)
+{ DISPATCH_SIMD(test_hal_intrin_uint64); }
+
+TEST(hal_intrin256, int64x4)
+{ DISPATCH_SIMD(test_hal_intrin_int64); }
+
+TEST(hal_intrin256, float32x8)
+{ DISPATCH_SIMD(test_hal_intrin_float32); }
+
+TEST(hal_intrin256, float64x4)
+{ DISPATCH_SIMD(test_hal_intrin_float64); }
+
+TEST(hal_intrin256, float16x16)
+{
+    if (!CV_CPU_HAS_SUPPORT_FP16)
+        throw SkipTestException("Unsupported hardware: FP16 is not available");
+    DISPATCH_SIMD(test_hal_intrin_float16);
+}
+
+}} // namespace

@@ -9,7 +9,7 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void test_hal_intrin_float16()
{
-   TheTest<v_float16x8>()
+   TheTest<v_float16>()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        ;

@@ -0,0 +1,296 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void test_hal_intrin_uint8();
void test_hal_intrin_int8();
void test_hal_intrin_uint16();
void test_hal_intrin_int16();
void test_hal_intrin_uint32();
void test_hal_intrin_int32();
void test_hal_intrin_uint64();
void test_hal_intrin_int64();
void test_hal_intrin_float32();
void test_hal_intrin_float64();
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
//============= 8-bit integer =====================================================================
void test_hal_intrin_uint8()
{
TheTest<v_uint8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
#if CV_SIMD256
TheTest<v_uint8>()
.test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
.test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
.test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
.test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
;
#endif
}
void test_hal_intrin_int8()
{
TheTest<v_int8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
//============= 16-bit integer =====================================================================
void test_hal_intrin_uint16()
{
TheTest<v_uint16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
void test_hal_intrin_int16()
{
TheTest<v_int16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_dot_prod()
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
//============= 32-bit integer =====================================================================
void test_hal_intrin_uint32()
{
TheTest<v_uint32>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
;
}
void test_hal_intrin_int32()
{
TheTest<v_int32>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_abs()
.test_cmp()
.test_popcount()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
;
}
//============= 64-bit integer =====================================================================
void test_hal_intrin_uint64()
{
TheTest<v_uint64>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
void test_hal_intrin_int64()
{
TheTest<v_int64>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
//============= Floating point =====================================================================
void test_hal_intrin_float32()
{
TheTest<v_float32>()
.test_loadstore()
.test_interleave()
.test_interleave_2channel()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_reduce()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt64()
.test_matmul()
.test_transpose()
.test_reduce_sum4()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
;
#if CV_SIMD256
TheTest<v_float32>()
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
;
#endif
}
void test_hal_intrin_float64()
{
#if CV_SIMD_64F
TheTest<v_float64>()
.test_loadstore()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt32()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
#if CV_SIMD256
TheTest<v_float64>()
.test_extract<2>().test_extract<3>()
.test_rotate<2>().test_rotate<3>()
;
#endif //CV_SIMD256
#endif
}
#if CV_FP16 && CV_SIMD_WIDTH > 16
void test_hal_intrin_float16()
{
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
}
#endif
#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} //namespace

@@ -13,6 +13,27 @@ void test_hal_intrin_float16();

template <typename R> struct Data;
template <int N> struct initializer;
template <> struct initializer<64>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
                 d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55],
                 d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]);
}
};
template <> struct initializer<32>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]);
}
};
template <> struct initializer<16>
{
    template <typename R> static R init(const Data<R> & d)
@@ -125,6 +146,17 @@ template <typename R> struct Data
    {
        return d + R::nlanes / 2;
    }
LaneType sum(int s, int c)
{
LaneType res = 0;
for (int i = s; i < s + c; ++i)
res += d[i];
return res;
}
LaneType sum()
{
return sum(0, R::nlanes);
}
    bool operator==(const Data<R> & other) const
    {
        for (int i = 0; i < R::nlanes; ++i)
@@ -147,13 +179,12 @@ template <typename R> struct Data
            return false;
        return true;
    }

    LaneType d[R::nlanes];
};
template<typename R> struct AlignedData
{
-   Data<R> CV_DECL_ALIGNED(16) a; // aligned
+   Data<R> CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned
    char dummy;
    Data<R> u; // unaligned
};
@@ -207,22 +238,22 @@ template<typename R> struct TheTest
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
-       EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
-       EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+       EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
+       EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);

        // check some initialization methods
        R r1 = data.a;
-       R r2 = v_load(data.u.d);
-       R r3 = v_load_aligned(data.a.d);
+       R r2 = vx_load(data.u.d);
+       R r3 = vx_load_aligned(data.a.d);
        R r4(r2);
        EXPECT_EQ(data.a[0], r1.get0());
        EXPECT_EQ(data.u[0], r2.get0());
        EXPECT_EQ(data.a[0], r3.get0());
        EXPECT_EQ(data.u[0], r4.get0());

-       R r_low = v_load_low((LaneType*)data.u.d);
+       R r_low = vx_load_low((LaneType*)data.u.d);
        EXPECT_EQ(data.u[0], r_low.get0());
        v_store(out.u.d, r_low);
        for (int i = 0; i < R::nlanes/2; ++i)
@@ -230,7 +261,7 @@ template<typename R> struct TheTest
            EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
        }

-       R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
+       R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2)));
        EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
        v_store(out.u.d, r_low_align8byte);
        for (int i = 0; i < R::nlanes/2; ++i)
@@ -255,7 +286,7 @@ template<typename R> struct TheTest

        // check halves load correctness
        res.clear();
-       R r6 = v_load_halves(d.d, d.mid());
+       R r6 = vx_load_halves(d.d, d.mid());
        v_store(res.d, r6);
        EXPECT_EQ(d, res);
@@ -270,17 +301,17 @@ template<typename R> struct TheTest
        }

        // reinterpret_as
-       v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
-       v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
-       v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
-       v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
-       v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
-       v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
-       v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
-       v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
-       v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
-#if CV_SIMD128_64F
-       v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
+       v_uint8 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
+       v_int8 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
+       v_uint16 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
+       v_int16 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
+       v_uint32 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
+       v_int32 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
+       v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
+       v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
+       v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
+#if CV_SIMD_64F
+       v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif

        return *this;
@@ -357,7 +388,7 @@ template<typename R> struct TheTest
        Data<R> dataA;
        R a = dataA;

-       Data<Rx2> resB = v_load_expand(dataA.d);
+       Data<Rx2> resB = vx_load_expand(dataA.d);

        Rx2 c, d;
        v_expand(a, c, d);
@@ -378,7 +409,7 @@ template<typename R> struct TheTest
    {
        typedef typename V_RegTraits<R>::q_reg Rx4;
        Data<R> data;
-       Data<Rx4> out = v_load_expand_q(data.d);
+       Data<Rx4> out = vx_load_expand_q(data.d);
        const int n = Rx4::nlanes;
        for (int i = 0; i < n; ++i)
            EXPECT_EQ(data[i], out[i]);
@@ -610,7 +641,13 @@ template<typename R> struct TheTest
    TheTest & test_popcount()
    {
-       static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33};
+       static unsigned popcountTable[] = {
+           0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
+           35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
+           83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
+           128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
+           176, 181, 186, 192, 193
+       };
        Data<R> dataA;
        R a = dataA;
@@ -918,7 +955,7 @@ template<typename R> struct TheTest
    TheTest & test_float_cvt32()
    {
-       typedef v_float32x4 Rt;
+       typedef v_float32 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
@@ -934,8 +971,8 @@ template<typename R> struct TheTest
    TheTest & test_float_cvt64()
    {
-#if CV_SIMD128_64F
-       typedef v_float64x2 Rt;
+#if CV_SIMD_64F
+       typedef v_float64 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
@@ -965,23 +1002,29 @@ template<typename R> struct TheTest
        R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;

        Data<R> res = v_matmul(v, a, b, c, d);
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           LaneType val = dataV[0] * dataA[i]
-                        + dataV[1] * dataB[i]
-                        + dataV[2] * dataC[i]
-                        + dataV[3] * dataD[i];
-           EXPECT_DOUBLE_EQ(val, res[i]);
+           for (int j = i; j < i + 4; ++j)
+           {
+               LaneType val = dataV[i] * dataA[j]
+                            + dataV[i + 1] * dataB[j]
+                            + dataV[i + 2] * dataC[j]
+                            + dataV[i + 3] * dataD[j];
+               EXPECT_COMPARE_EQ(val, res[j]);
+           }
        }

        Data<R> resAdd = v_matmuladd(v, a, b, c, d);
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           LaneType val = dataV[0] * dataA[i]
-                        + dataV[1] * dataB[i]
-                        + dataV[2] * dataC[i]
-                        + dataD[i];
-           EXPECT_DOUBLE_EQ(val, resAdd[i]);
+           for (int j = i; j < i + 4; ++j)
+           {
+               LaneType val = dataV[i] * dataA[j]
+                            + dataV[i + 1] * dataB[j]
+                            + dataV[i + 2] * dataC[j]
+                            + dataD[j];
+               EXPECT_COMPARE_EQ(val, resAdd[j]);
+           }
        }
        return *this;
    }
@@ -998,30 +1041,36 @@ template<typename R> struct TheTest
                    e, f, g, h);

        Data<R> res[4] = {e, f, g, h};
-       for (int i = 0; i < R::nlanes; ++i)
+       for (int i = 0; i < R::nlanes; i += 4)
        {
-           EXPECT_EQ(dataA[i], res[i][0]);
-           EXPECT_EQ(dataB[i], res[i][1]);
-           EXPECT_EQ(dataC[i], res[i][2]);
-           EXPECT_EQ(dataD[i], res[i][3]);
+           for (int j = 0; j < 4; ++j)
+           {
+               EXPECT_EQ(dataA[i + j], res[j][i]);
+               EXPECT_EQ(dataB[i + j], res[j][i + 1]);
+               EXPECT_EQ(dataC[i + j], res[j][i + 2]);
+               EXPECT_EQ(dataD[i + j], res[j][i + 3]);
+           }
        }
        return *this;
    }

    TheTest & test_reduce_sum4()
    {
-       R a(0.1f, 0.02f, 0.003f, 0.0004f);
-       R b(1, 20, 300, 4000);
-       R c(10, 2, 0.3f, 0.04f);
-       R d(1, 2, 3, 4);
-
-       R sum = v_reduce_sum4(a, b, c, d);
-
-       Data<R> res = sum;
-       EXPECT_EQ(0.1234f, res[0]);
-       EXPECT_EQ(4321.0f, res[1]);
-       EXPECT_EQ(12.34f, res[2]);
-       EXPECT_EQ(10.0f, res[3]);
+       Data<R> dataA, dataB, dataC, dataD;
+       dataB *= 0.01f;
+       dataC *= 0.001f;
+       dataD *= 0.002f;
+       R a = dataA, b = dataB, c = dataC, d = dataD;
+       Data<R> res = v_reduce_sum4(a, b, c, d);
+
+       for (int i = 0; i < R::nlanes; i += 4)
+       {
+           EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
+           EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
+           EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
+           EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]);
+       }
        return *this;
    }
@@ -1032,14 +1081,14 @@ template<typename R> struct TheTest
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
-       EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
-       EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
-       EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+       EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
+       EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
+       EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);

        // check some initialization methods
        R r1 = data.u;
-       R r2 = v_load_f16(data.a.d);
+       R r2 = vx_load_f16(data.a.d);
        R r3(r2);
        EXPECT_EQ(data.u[0], r1.get0());
        EXPECT_EQ(data.a[0], r2.get0());

@@ -173,7 +173,6 @@ void Core_RandTest::run( int )
            dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz;
            Mat aslice = arr[k].colRange(sz, sz + dsz);
            tested_rng.fill(aslice, dist_type, A, B);
-           //printf("%d - %d\n", sz, sz + dsz);
        }
    }

@@ -85,12 +85,6 @@ else()
    set(sources_options EXCLUDE_OPENCL)
endif()

-if(HAVE_INF_ENGINE)
-    add_definitions(-DHAVE_INF_ENGINE=1)
-    list(APPEND include_dirs ${INF_ENGINE_INCLUDE_DIRS})
-    list(APPEND libs ${INF_ENGINE_LIBRARIES})
-endif()
-
ocv_module_include_directories(${include_dirs})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override") # GCC
@@ -98,9 +92,9 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override") # Clang
endif()
ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
-ocv_create_module(${libs})
+ocv_create_module(${libs} ${INF_ENGINE_TARGET})
ocv_add_samples()
-ocv_add_accuracy_tests()
+ocv_add_accuracy_tests(${INF_ENGINE_TARGET})
ocv_add_perf_tests()

ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
@@ -120,9 +114,3 @@ if(BUILD_PERF_TESTS)
    endif()
  endif()
endif()
-
-# Test Intel's Inference Engine models
-if(HAVE_INF_ENGINE AND TARGET opencv_test_dnn)
-  ocv_target_include_directories(opencv_test_dnn PRIVATE ${INF_ENGINE_INCLUDE_DIRS})
-  ocv_target_link_libraries(opencv_test_dnn LINK_PRIVATE ${INF_ENGINE_LIBRARIES})
-endif()

@@ -201,7 +201,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
     * @param[out] outputs allocated output blobs, which will store results of the computation.
     * @param[out] internals allocated internal blobs
     */
-   virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) = 0;
+   virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);

    /** @brief Given the @p input blobs, computes the output @p blobs.
     * @param[in] inputs the input blobs.

@@ -44,7 +44,9 @@
#include <opencv2/core.hpp>
#include <opencv2/core/types_c.h>
+#include <iostream>
#include <ostream>
+#include <sstream>

namespace cv {
namespace dnn {
@@ -178,13 +180,25 @@ static inline MatShape concat(const MatShape& a, const MatShape& b)
    return c;
}

-inline void print(const MatShape& shape, const String& name = "")
-{
-    printf("%s: [", name.c_str());
-    size_t i, n = shape.size();
-    for( i = 0; i < n; i++ )
-        printf(" %d", shape[i]);
-    printf(" ]\n");
-}
+static inline std::string toString(const MatShape& shape, const String& name = "")
+{
+    std::ostringstream ss;
+    if (!name.empty())
+        ss << name << ' ';
+    ss << '[';
+    for(size_t i = 0, n = shape.size(); i < n; ++i)
+        ss << ' ' << shape[i];
+    ss << " ]";
+    return ss.str();
+}
+static inline void print(const MatShape& shape, const String& name = "")
+{
+    std::cout << toString(shape, name) << std::endl;
+}
+static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape)
+{
+    out << toString(shape);
+    return out;
+}

inline int clamp(int ax, int dims)
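A minimal usage sketch for the helpers added above (an illustration, not part of the patch; MatShape is the std::vector<int> alias used throughout the module):

#include <iostream>
#include <opencv2/dnn/shape_utils.hpp>

int main()
{
    using cv::dnn::operator<<;               // MatShape is std::vector<int>, so ADL
                                             // does not find the operator on its own
    cv::dnn::MatShape shape;
    shape.push_back(1); shape.push_back(3);
    shape.push_back(224); shape.push_back(224);
    std::cout << cv::dnn::toString(shape, "input") << std::endl; // "input [ 1 3 224 224 ]"
    std::cout << shape << std::endl;                             // "[ 1 3 224 224 ]"
    return 0;
}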

@@ -74,6 +74,10 @@ static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT(
#endif
);

+// Additional checks (slows down execution!)
+static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
+static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
+static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
@@ -2053,10 +2057,75 @@ struct Net::Impl
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
        {
+           std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
            std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
+           std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
-           layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
+           layer->forward(umat_inputBlobs,
                           umat_outputBlobs,
-                          OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers));
+                          umat_internalBlobs);
if (DNN_CHECK_NAN_INF)
{
bool fail = false;
for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
if (!checkRange(m))
{
std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
else if (!checkRange(m, true, NULL, -1e6, 1e6))
{
std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
}
if (fail)
{
for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
{
UMat& u = umat_inputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
{
UMat& u = umat_internalBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
else
m = u.getMat(ACCESS_READ);
std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
}
if (DNN_CHECK_NAN_INF_RAISE_ERROR)
CV_Assert(!fail);
}
}
            OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
        }
        else
@@ -2069,6 +2138,56 @@ struct Net::Impl
            layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
if (DNN_CHECK_NAN_INF)
{
bool fail = false;
for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
{
const Mat& m = ld.outputBlobs[i];
if (!checkRange(m))
{
std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
else if (!checkRange(m, true, NULL, -1e6, 1e6))
{
std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
fail = true;
}
}
if (fail)
{
for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
{
const Mat* pM = ld.inputBlobs[i];
if (!pM)
{
std::cout << "INPUT " << i << " is NULL" << std::endl;
continue;
}
const Mat& m = *pM;
std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
{
const Mat& m = ld.outputBlobs[i];
std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
for (size_t i = 0; i < ld.internals.size(); ++i)
{
const Mat& m = ld.internals[i];
std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
}
if (DNN_CHECK_NAN_INF_RAISE_ERROR)
CV_Assert(!fail);
}
}
            for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
            {
                if (!ld.outputBlobsWrappers[i].empty())
@@ -3071,6 +3190,14 @@ std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
    return outputs;
}
void Layer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
Layer::forward_fallback(inputs, outputs, internals);
}
void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
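The checks added above are driven by the OPENCV_DNN_CHECK_NAN_INF* variables declared earlier in dnn.cpp; since they are read by static initializers, they must already be set in the process environment when the library loads, e.g. `OPENCV_DNN_CHECK_NAN_INF=1 OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR=1 ./app`. A hypothetical driver (model file names invented for illustration):

#include <opencv2/dnn.hpp>

int main()
{
    // With the flags exported, every layer output computed below is scanned
    // for NaN/Inf, offending layers are reported, and (with RAISE_ERROR) the
    // forward pass fails via CV_Assert instead of silently propagating them.
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("model.prototxt", "model.caffemodel");
    cv::Mat blob = cv::dnn::blobFromImage(cv::Mat::zeros(224, 224, CV_8UC3));
    net.setInput(blob);
    cv::Mat out = net.forward();
    (void)out;
    return 0;
}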

@ -196,7 +196,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE virtual bool supportBackend(int backendId) CV_OVERRIDE
{ {
return backendId == DNN_BACKEND_OPENCV || return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !_locPredTransposed; backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized;
} }
bool getMemoryShapes(const std::vector<MatShape> &inputs, bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -411,9 +411,12 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (_bboxesNormalized)
{
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
           OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
           forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
@@ -916,6 +919,7 @@ public:
ieLayer->params["nms_threshold"] = format("%f", _nmsThreshold);
ieLayer->params["top_k"] = format("%d", _topK);
ieLayer->params["keep_top_k"] = format("%d", _keepTopK);
ieLayer->params["eta"] = "1.0";
ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold); ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold);
ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0"; ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";
ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType; ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType;

@@ -135,10 +135,17 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
{
if (preferableTarget == DNN_TARGET_MYRIAD)
return type == MAX || type == AVE;
else
return type != STOCHASTIC;
}
else
return backendId == DNN_BACKEND_OPENCV ||
       backendId == DNN_BACKEND_HALIDE && haveHalide() &&
-      (type == MAX || type == AVE && !pad.width && !pad.height) ||
-      backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (type == MAX || type == AVE);
+      (type == MAX || type == AVE && !pad.width && !pad.height);
}
#ifdef HAVE_OPENCL
@@ -192,8 +199,11 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (type == MAX || type == AVE || type == STOCHASTIC)
{
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
           forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
@@ -238,22 +248,41 @@ public:
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
-lp.type = "Pooling";
lp.precision = InferenceEngine::Precision::FP32;
-std::shared_ptr<InferenceEngine::PoolingLayer> ieLayer(new InferenceEngine::PoolingLayer(lp));
-ieLayer->_kernel_x = kernel.width;
-ieLayer->_kernel_y = kernel.height;
-ieLayer->_stride_x = stride.width;
-ieLayer->_stride_y = stride.height;
-ieLayer->_padding_x = pad.width;
-ieLayer->_padding_y = pad.height;
-ieLayer->_exclude_pad = type == AVE && padMode == "SAME";
-ieLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
-if (type == MAX)
-    ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::MAX;
-else if (type == AVE)
-    ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::AVG;
+std::shared_ptr<InferenceEngine::CNNLayer> ieLayer;
+if (type == MAX || type == AVE)
+{
+    lp.type = "Pooling";
+    InferenceEngine::PoolingLayer* poolLayer = new InferenceEngine::PoolingLayer(lp);
+    poolLayer->_kernel_x = kernel.width;
+    poolLayer->_kernel_y = kernel.height;
+    poolLayer->_stride_x = stride.width;
+    poolLayer->_stride_y = stride.height;
+    poolLayer->_padding_x = pad.width;
+    poolLayer->_padding_y = pad.height;
+    poolLayer->_exclude_pad = type == AVE && padMode == "SAME";
+    poolLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
+    poolLayer->_type = type == MAX ? InferenceEngine::PoolingLayer::PoolType::MAX :
+                                     InferenceEngine::PoolingLayer::PoolType::AVG;
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(poolLayer);
+}
+else if (type == ROI)
+{
+    lp.type = "ROIPooling";
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+    ieLayer->params["pooled_w"] = format("%d", pooledSize.width);
+    ieLayer->params["pooled_h"] = format("%d", pooledSize.height);
+    ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+}
+else if (type == PSROI)
+{
+    lp.type = "PSROIPooling";
+    ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+    ieLayer->params["output_dim"] = format("%d", psRoiOutChannels);
+    ieLayer->params["group_size"] = format("%d", pooledSize.width);
+    ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+}
else
    CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
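The "rounding-type" parameter forwarded above decides how fractional output extents are resolved. A sketch of the standard pooling arithmetic it corresponds to (assuming symmetric padding; not code from the patch):

#include <cmath>

// For in=7, kernel=3, pad=0, stride=2: floor gives (7-3)/2+1 = 3, ceil gives 4.
static int pooledExtent(int in, int kernel, int pad, int stride, bool ceilMode)
{
    double a = double(in + 2 * pad - kernel) / stride;
    return int(ceilMode ? std::ceil(a) : std::floor(a)) + 1;
}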

@@ -6,6 +6,7 @@
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
namespace cv { namespace dnn {
@@ -16,14 +17,14 @@ public:
{
setParamsFrom(params);
-uint32_t featStride = params.get<uint32_t>("feat_stride", 16);
+featStride = params.get<uint32_t>("feat_stride", 16);
-uint32_t baseSize = params.get<uint32_t>("base_size", 16);
+baseSize = params.get<uint32_t>("base_size", 16);
// uint32_t minSize = params.get<uint32_t>("min_size", 16);
-uint32_t keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
+keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
keepTopAfterNMS = params.get<uint32_t>("post_nms_topn", 300);
-float nmsThreshold = params.get<float>("nms_thresh", 0.7);
+nmsThreshold = params.get<float>("nms_thresh", 0.7);
-DictValue ratios = params.get("ratio");
+ratios = params.get("ratio");
-DictValue scales = params.get("scale");
+scales = params.get("scale");
{
LayerParams lp;
@@ -83,6 +84,12 @@ public:
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && preferableTarget != DNN_TARGET_MYRIAD;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
                     const int requiredOutputs,
                     std::vector<MatShape> &outputs,
@@ -312,6 +319,38 @@ public:
outputs[i].rowRange(numDets, keepTopAfterNMS).setTo(0);
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Proposal";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
ieLayer->params["base_size"] = format("%d", baseSize);
ieLayer->params["feat_stride"] = format("%d", featStride);
ieLayer->params["min_size"] = "16";
ieLayer->params["nms_thresh"] = format("%f", nmsThreshold);
ieLayer->params["post_nms_topn"] = format("%d", keepTopAfterNMS);
ieLayer->params["pre_nms_topn"] = format("%d", keepTopBeforeNMS);
if (ratios.size())
{
ieLayer->params["ratio"] = format("%f", ratios.get<float>(0));
for (int i = 1; i < ratios.size(); ++i)
ieLayer->params["ratio"] += format(",%f", ratios.get<float>(i));
}
if (scales.size())
{
ieLayer->params["scale"] = format("%f", scales.get<float>(0));
for (int i = 1; i < scales.size(); ++i)
ieLayer->params["scale"] += format(",%f", scales.get<float>(i));
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
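For illustration (hypothetical values): with ratios = {0.5, 1, 2} and scales = {8, 16, 32}, the two loops above serialize

ieLayer->params["ratio"] = "0.500000,1.000000,2.000000";
ieLayer->params["scale"] = "8.000000,16.000000,32.000000";

i.e. format("%f") renders each value with six decimals and the entries are joined with commas, the list form the Inference Engine layer parameters use.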
private:
// The first half of the channels are background scores; we need only the second half.
static Mat getObjectScores(const Mat& m)
@@ -342,8 +381,10 @@ private:
Ptr<PermuteLayer> deltasPermute;
Ptr<PermuteLayer> scoresPermute;
-uint32_t keepTopAfterNMS;
+uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize;
Mat fakeImageBlob;
float nmsThreshold;
DictValue ratios, scales;
#ifdef HAVE_OPENCL
UMat umat_fakeImageBlob;
#endif

@@ -183,8 +183,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
ocl::Kernel oclk_sto_pool_forward(
    kname.c_str(),
    ocl::dnn::ocl4dnn_pooling_oclsrc,
-   format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+   format(" -D Dtype=%s -D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
           " -D STRIDE_W=%d -D STRIDE_H=%d",
+          (use_half) ? "half" : "float",
           kernel_w_, kernel_h_,
           stride_w_, stride_h_
    ));
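For reference, with hypothetical values use_half == true, a 3x3 kernel and 2x2 stride, the format() call above now produces the build options

 -D Dtype=half -D KERNEL_STO_POOL=1 -D KERNEL_W=3 -D KERNEL_H=3 -D STRIDE_W=2 -D STRIDE_H=2

so the same ocl4dnn_pooling.cl source can be compiled for either half or float, matching the Dtype changes to the kernels further below.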

@@ -322,12 +322,32 @@ InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noexcept
return InferenceEngine::StatusCode::OK;
}
InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(size_t size, InferenceEngine::ResponseDesc *responseDesc) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
size_t InfEngineBackendNet::getBatchSize() const noexcept
{
CV_Error(Error::StsNotImplemented, "");
return 0;
}
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
InferenceEngine::StatusCode InfEngineBackendNet::AddExtension(const InferenceEngine::IShapeInferExtensionPtr &extension, InferenceEngine::ResponseDesc *resp) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
InferenceEngine::StatusCode InfEngineBackendNet::reshape(const InferenceEngine::ICNNNetwork::InputShapes &inputShapes, InferenceEngine::ResponseDesc *resp) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
}
#endif
void InfEngineBackendNet::init(int targetId)
{
if (inputs.empty())

@@ -9,6 +9,8 @@
#define __OPENCV_DNN_OP_INF_ENGINE_HPP__
#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/dnn.hpp"
#ifdef HAVE_INF_ENGINE
#if defined(__GNUC__) && __GNUC__ >= 5
@@ -19,6 +21,17 @@
#if defined(__GNUC__) && __GNUC__ >= 5
//#pragma GCC diagnostic pop
#endif
#define INF_ENGINE_RELEASE_2018R1 2018010000
#define INF_ENGINE_RELEASE_2018R2 2018020000
#ifndef INF_ENGINE_RELEASE
#warning("IE version have not been provided via command-line. Using 2018R2 by default")
#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R2
#endif
#define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
#endif // HAVE_INF_ENGINE
namespace cv { namespace dnn {
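INF_ENGINE_RELEASE packs the version as four year digits, two release digits and four patch digits, so dividing by 10000 keeps the year+release part and drops the patch level; INF_ENGINE_VER_MAJOR_GT is therefore a release-level comparison. A worked example of the arithmetic, assuming a hypothetical 2018R3 value of 2018030000 (not defined by this patch):

// Illustrative only; INF_ENGINE_RELEASE_2018R3 is an assumed value.
#define INF_ENGINE_RELEASE_2018R3 2018030000
// With INF_ENGINE_RELEASE == 2018030000:
//   INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
//     == ((2018030000 / 10000) > (2018020000 / 10000))
//     == (201803 > 201802) == true
// so the AddExtension/reshape overrides guarded below compile only past 2018R2.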
@@ -86,8 +99,15 @@ public:
virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;
virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) noexcept;
virtual size_t getBatchSize() const noexcept CV_OVERRIDE;
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) noexcept;
virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) noexcept;
#endif
void init(int targetId);
void addBlobs(const std::vector<Ptr<BackendWrapper> >& wrappers);

@@ -104,7 +104,7 @@ __kernel void
#elif defined KERNEL_AVE_POOL
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
-   const int nthreads, __global const Dtype* const bottom_data,
+   const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)
@@ -150,7 +150,7 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
#elif defined KERNEL_STO_POOL
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
-   const int nthreads, __global const Dtype* const bottom_data,
+   const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)

@@ -1293,7 +1293,13 @@ void TFImporter::populateNet(Net dstNet)
if (!next_layers.empty())
{
    int maximumLayerIdx = next_layers[0].second;
-   ExcludeLayer(net, maximumLayerIdx, 0, false);
+   CV_Assert(net.node(maximumLayerIdx).input_size() == 2);
+   // The input from the Mul layer can also be at index 1.
+   int mulInputIdx = (net.node(maximumLayerIdx).input(0) == name) ? 0 : 1;
+   ExcludeLayer(net, maximumLayerIdx, mulInputIdx, false);
    layers_to_ignore.insert(next_layers[0].first);
    layerParams.set("negative_slope", scaleMat.at<float>(0));

@@ -938,6 +938,16 @@ struct TorchImporter
layerParams.set("end", DictValue::arrayInt<int*>(&ends[0], 4));
curModule->modules.push_back(newModule);
}
else if (nnName == "SpatialUpSamplingNearest")
{
readTorchTable(scalarParams, tensorParams);
CV_Assert(scalarParams.has("scale_factor"));
int scale_factor = scalarParams.get<int>("scale_factor");
newModule->apiType = "Resize";
layerParams.set("interpolation", "nearest");
layerParams.set("zoom_factor", scale_factor);
curModule->modules.push_back(newModule);
}
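With this branch, Torch's SpatialUpSamplingNearest maps onto the existing dnn Resize layer instead of requiring a custom layer. A sketch of the LayerParams the importer effectively builds for scale_factor = 2 (the layer name here is hypothetical):

cv::dnn::LayerParams lp;
lp.type = "Resize";                  // newModule->apiType above
lp.name = "upsample_nearest";        // illustrative name
lp.set("interpolation", "nearest");
lp.set("zoom_factor", 2);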
else
{
// Importer does not know how to map Torch's layer type to an OpenCV one.

@@ -175,7 +175,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 0.0;
-float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
+float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0;
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",
           inp, "detection_out", "", l1, lInf, 0.25);
}
@@ -233,11 +233,8 @@ TEST_P(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE)
    throw SkipTestException("");
-Size inpSize;
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-    inpSize = Size(300, 300);
Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
-Mat inp = blobFromImage(img, 1.0, inpSize, Scalar(104.0, 177.0, 123.0), false, false);
+Mat inp = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt",
           inp, "detection_out");
}
@@ -249,7 +246,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.008 : 0.0;
-float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.07 : 0.0;
+float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0731 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
           inp, "detection_out", "", l1, lInf);
}

@@ -51,6 +51,33 @@ static std::string _tf(TString filename)
return (getOpenCVExtraDir() + "/dnn/") + filename;
}
class Test_Caffe_nets : public DNNTestLayer
{
public:
void testFaster(const std::string& proto, const std::string& model, const Mat& ref,
double scoreDiff = 0.0, double iouDiff = 0.0)
{
checkBackend();
Net net = readNetFromCaffe(findDataFile("dnn/" + proto, false),
findDataFile("dnn/" + model, false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
Mat img = imread(findDataFile("dnn/dog416.png", false));
resize(img, img, Size(800, 600));
Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
net.setInput(blob, "data");
net.setInput(imInfo, "im_info");
// Output has shape 1x1xNx7, where N is the number of detections.
// Every detection is a vector of values [id, classId, confidence, left, top, right, bottom].
Mat out = net.forward();
scoreDiff = scoreDiff ? scoreDiff : default_l1;
iouDiff = iouDiff ? iouDiff : default_lInf;
normAssertDetections(ref, out, ("model name: " + model).c_str(), 0.8, scoreDiff, iouDiff);
}
};
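Given the layout described in the comment above, the 1x1xNx7 blob can be walked row by row once reshaped to N x 7. An illustrative post-processing loop (not part of the test; the 0.8 threshold matches the score used by normAssertDetections here):

cv::Mat detections = out.reshape(1, (int)out.total() / 7);  // N x 7 rows
for (int i = 0; i < detections.rows; ++i)
{
    const float* d = detections.ptr<float>(i);
    if (d[2] > 0.8f)  // d = [id, classId, confidence, left, top, right, bottom]
        std::cout << "class " << int(d[1]) << " conf " << d[2] << " box ["
                  << d[3] << ", " << d[4] << ", " << d[5] << ", " << d[6] << "]" << std::endl;
}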
TEST(Test_Caffe, memory_read)
{
const string proto = findDataFile("dnn/bvlc_googlenet.prototxt", false);
@@ -344,9 +371,15 @@ TEST(Reproducibility_GoogLeNet_fp16, Accuracy)
}
// https://github.com/richzhang/colorization
-TEST(Reproducibility_Colorization, Accuracy)
+TEST_P(Test_Caffe_nets, Colorization)
{
-const float l1 = 3e-5;
+checkBackend();
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+const float l1 = 4e-4;
const float lInf = 3e-3;
Mat inp = blobFromNPY(_tf("colorization_inp.npy"));
@@ -356,7 +389,8 @@ TEST(Reproducibility_Colorization, Accuracy)
const string proto = findDataFile("dnn/colorization_deploy_v2.prototxt", false);
const string model = findDataFile("dnn/colorization_release_v2.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
-net.setPreferableBackend(DNN_BACKEND_OPENCV);
+net.setPreferableBackend(backend);
+net.setPreferableTarget(target);
net.getLayer(net.getLayerId("class8_ab"))->blobs.push_back(kernel);
net.getLayer(net.getLayerId("conv8_313_rh"))->blobs.push_back(Mat(1, 313, CV_32F, 2.606));
@@ -447,39 +481,40 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
)
);
-TEST(Test_Caffe, FasterRCNN_and_RFCN)
-{
-std::string models[] = {"VGG16_faster_rcnn_final.caffemodel", "ZF_faster_rcnn_final.caffemodel",
-                        "resnet50_rfcn_final.caffemodel"};
-std::string protos[] = {"faster_rcnn_vgg16.prototxt", "faster_rcnn_zf.prototxt",
-                        "rfcn_pascal_voc_resnet50.prototxt"};
-Mat refs[] = {(Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
-                                    0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
-                                    0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166),
-              (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
-                                    0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
-                                    0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176),
-              (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
-                                    0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16)};
-for (int i = 0; i < 3; ++i)
-{
-    std::string proto = findDataFile("dnn/" + protos[i], false);
-    std::string model = findDataFile("dnn/" + models[i], false);
-    Net net = readNetFromCaffe(proto, model);
-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
-    Mat img = imread(findDataFile("dnn/dog416.png", false));
-    resize(img, img, Size(800, 600));
-    Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
-    Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
-    net.setInput(blob, "data");
-    net.setInput(imInfo, "im_info");
-    // Output has shape 1x1xNx7 where N - number of detections.
-    // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
-    Mat out = net.forward();
-    normAssertDetections(refs[i], out, ("model name: " + models[i]).c_str(), 0.8);
-}
-}
+TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
+                                       0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
+                                       0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
+testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref);
+}
+
+TEST_P(Test_Caffe_nets, FasterRCNN_zf)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
+                                       0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
+                                       0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
+testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref);
+}
+
+TEST_P(Test_Caffe_nets, RFCN)
+{
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+static Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
+                                       0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
+testFaster("rfcn_pascal_voc_resnet50.prototxt", "resnet50_rfcn_final.caffemodel", ref);
+}
INSTANTIATE_TEST_CASE_P(/**/, Test_Caffe_nets, dnnBackendsAndTargets());
}} // namespace

@@ -16,7 +16,7 @@ using namespace cv;
using namespace cv::dnn;
using namespace testing;
-static void test(Mat& input, Net& net, int backendId, int targetId)
+static void test(Mat& input, Net& net, Backend backendId, Target targetId, bool skipCheck = false)
{
DNNTestLayer::checkBackend(backendId, targetId);
randu(input, -1.0f, 1.0f);
@@ -29,16 +29,19 @@ static void test(Mat& input, Net& net, int backendId, int targetId)
net.setPreferableTarget(targetId);
Mat outputHalide = net.forward().clone();
if (skipCheck)
return;
double l1, lInf;
DNNTestLayer::getDefaultThresholds(backendId, targetId, &l1, &lInf);
normAssert(outputDefault, outputHalide, "", l1, lInf);
}
-static void test(LayerParams& params, Mat& input, int backendId, int targetId)
+static void test(LayerParams& params, Mat& input, Backend backendId, Target targetId, bool skipCheck = false)
{
Net net;
net.addLayerToPrev(params.name, params.type, params);
-test(input, net, backendId, targetId);
+test(input, net, backendId, targetId, skipCheck);
}
static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
@@ -101,16 +104,17 @@ TEST_P(Convolution, Accuracy)
Size pad = get<4>(GetParam());
Size dilation = get<5>(GetParam());
bool hasBias = get<6>(GetParam());
-int backendId = get<0>(get<7>(GetParam()));
-int targetId = get<1>(get<7>(GetParam()));
+Backend backendId = get<0>(get<7>(GetParam()));
+Target targetId = get<1>(get<7>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
bool skipCheck = false;
if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV &&
    (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
    kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1))
-   throw SkipTestException("Skip unstable test");
+   skipCheck = true;
int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
Mat weights(4, &sz[0], CV_32F);
@@ -139,7 +143,9 @@ TEST_P(Convolution, Accuracy)
}
int inpSz[] = {1, inChannels, inSize.height, inSize.width};
Mat input(4, &inpSz[0], CV_32F);
-test(lp, input, backendId, targetId);
+test(lp, input, backendId, targetId, skipCheck);
if (skipCheck)
throw SkipTestException("Skip checks in unstable test");
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
@@ -171,8 +177,8 @@ TEST_P(Deconvolution, Accuracy)
Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]);
Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]);
bool hasBias = get<6>(GetParam());
-int backendId = get<0>(get<7>(GetParam()));
-int targetId = get<1>(get<7>(GetParam()));
+Backend backendId = get<0>(get<7>(GetParam()));
+Target targetId = get<1>(get<7>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU &&
    dilation.width == 2 && dilation.height == 2)
    throw SkipTestException("");
@@ -235,8 +241,8 @@ TEST_P(LRN, Accuracy)
float bias = get<2>(GetParam())[2];
bool normBySize = get<3>(GetParam());
std::string nrmType = get<4>(GetParam());
-int backendId = get<0>(get<5>(GetParam()));
-int targetId = get<1>(get<5>(GetParam()));
+Backend backendId = get<0>(get<5>(GetParam()));
+Target targetId = get<1>(get<5>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    throw SkipTestException("");
@@ -276,8 +282,8 @@ TEST_P(AvePooling, Accuracy)
Size outSize = get<1>(GetParam()); // Input size will be computed from parameters.
Size kernel = get<2>(GetParam());
Size stride = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
@@ -317,8 +323,8 @@ TEST_P(MaxPooling, Accuracy)
Size kernel = get<2>(GetParam());
Size stride = get<3>(GetParam());
Size pad = get<4>(GetParam());
-int backendId = get<0>(get<5>(GetParam()));
-int targetId = get<1>(get<5>(GetParam()));
+Backend backendId = get<0>(get<5>(GetParam()));
+Target targetId = get<1>(get<5>(GetParam()));
LayerParams lp;
lp.set("pool", "max");
@@ -355,8 +361,8 @@ TEST_P(FullyConnected, Accuracy)
Size inSize = get<1>(GetParam());
int outChannels = get<2>(GetParam());
bool hasBias = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    throw SkipTestException("");
@@ -394,8 +400,8 @@ typedef TestWithParam<tuple<int, tuple<Backend, Target> > > SoftMax;
TEST_P(SoftMax, Accuracy)
{
int inChannels = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.type = "SoftMax";
lp.name = "testLayer";
@@ -457,7 +463,7 @@ TEST_P(Test_Halide_layers, MaxPoolUnpool)
////////////////////////////////////////////////////////////////////////////////
static const int kNumChannels = 3;
-void testInPlaceActivation(LayerParams& lp, int backendId, int targetId)
+void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId)
{
EXPECT_FALSE(lp.name.empty());
@@ -485,8 +491,8 @@ TEST_P(BatchNorm, Accuracy)
bool hasWeights = get<0>(GetParam());
bool hasBias = get<1>(GetParam());
float epsilon = get<2>(GetParam());
-int backendId = get<0>(get<3>(GetParam()));
-int targetId = get<1>(get<3>(GetParam()));
+Backend backendId = get<0>(get<3>(GetParam()));
+Target targetId = get<1>(get<3>(GetParam()));
LayerParams lp;
lp.set("has_weight", hasWeights);
@@ -518,8 +524,8 @@ typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
TEST_P(ReLU, Accuracy)
{
float negativeSlope = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("negative_slope", negativeSlope);
@@ -536,8 +542,8 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
TEST_P(NoParamActivation, Accuracy)
{
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.type = get<0>(GetParam());
@@ -555,8 +561,8 @@ TEST_P(Power, Accuracy)
float power = get<0>(GetParam())[0];
float scale = get<0>(GetParam())[1];
float shift = get<0>(GetParam())[2];
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("power", power);
@@ -589,8 +595,8 @@ typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
TEST_P(Scale, Accuracy)
{
bool hasBias = get<0>(GetParam());
-int backendId = get<0>(get<1>(GetParam()));
-int targetId = get<1>(get<1>(GetParam()));
+Backend backendId = get<0>(get<1>(GetParam()));
+Target targetId = get<1>(get<1>(GetParam()));
LayerParams lp;
lp.set("bias_term", hasBias);
@@ -624,8 +630,8 @@ TEST_P(Concat, Accuracy)
{
Vec3i inSize = get<0>(GetParam());
Vec3i numChannels = get<1>(GetParam());
-int backendId = get<0>(get<2>(GetParam()));
-int targetId = get<1>(get<2>(GetParam()));
+Backend backendId = get<0>(get<2>(GetParam()));
+Target targetId = get<1>(get<2>(GetParam()));
Net net;
@@ -692,8 +698,8 @@ TEST_P(Eltwise, Accuracy)
std::string op = get<1>(GetParam());
int numConv = get<2>(GetParam());
bool weighted = get<3>(GetParam());
-int backendId = get<0>(get<4>(GetParam()));
-int targetId = get<1>(get<4>(GetParam()));
+Backend backendId = get<0>(get<4>(GetParam()));
+Target targetId = get<1>(get<4>(GetParam()));
Net net;

@@ -1205,14 +1205,6 @@ public:
}
}
-void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE
-{
-    CV_TRACE_FUNCTION();
-    CV_TRACE_ARG_VALUE(name, "name", name.c_str());
-    Layer::forward_fallback(inputs, outputs, internals);
-}
private:
int outWidth, outHeight, zoomFactor;
};
@@ -1225,7 +1217,7 @@ TEST_P(Test_Caffe_layers, DISABLED_Interp) // requires patched protobuf (available in OpenCV source tree only)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
    throw SkipTestException("");
-// Test a cusom layer.
+// Test a custom layer.
CV_DNN_REGISTER_LAYER_CLASS(Interp, CustomInterpLayer);
try
{

@@ -230,6 +230,13 @@ TEST_P(Test_TensorFlow_layers, flatten)
runTensorFlowNet("unfused_flatten_unknown_batch");
}
TEST_P(Test_TensorFlow_layers, leaky_relu)
{
runTensorFlowNet("leaky_relu_order1");
runTensorFlowNet("leaky_relu_order2");
runTensorFlowNet("leaky_relu_order3");
}
TEST_P(Test_TensorFlow_layers, l2_normalize)
{
runTensorFlowNet("l2_normalize");

@@ -69,100 +69,119 @@ TEST(Torch_Importer, simple_read)
ASSERT_FALSE(net.empty());
}
-static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "",
-                        bool check2ndBlob = false, bool isBinary = false)
-{
+class Test_Torch_layers : public DNNTestLayer
+{
+public:
+void runTorchNet(const String& prefix, String outLayerName = "",
+                 bool check2ndBlob = false, bool isBinary = false,
+                 double l1 = 0.0, double lInf = 0.0)
+{
String suffix = (isBinary) ? ".dat" : ".txt";
-Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
-ASSERT_FALSE(net.empty());
-net.setPreferableBackend(DNN_BACKEND_OPENCV);
-net.setPreferableTarget(targetId);
Mat inp, outRef;
ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) );
ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) );
+checkBackend(backend, target, &inp, &outRef);
+Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
+ASSERT_FALSE(net.empty());
+net.setPreferableBackend(backend);
+net.setPreferableTarget(target);
if (outLayerName.empty())
    outLayerName = net.getLayerNames().back();
net.setInput(inp);
std::vector<Mat> outBlobs;
net.forward(outBlobs, outLayerName);
-normAssert(outRef, outBlobs[0]);
+l1 = l1 ? l1 : default_l1;
+lInf = lInf ? lInf : default_lInf;
+normAssert(outRef, outBlobs[0], "", l1, lInf);
-if (check2ndBlob)
+if (check2ndBlob && backend != DNN_BACKEND_INFERENCE_ENGINE)
{
    Mat out2 = outBlobs[1];
    Mat ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary);
-   normAssert(out2, ref2);
+   normAssert(out2, ref2, "", l1, lInf);
}
}
+};
-typedef testing::TestWithParam<Target> Test_Torch_layers;
TEST_P(Test_Torch_layers, run_convolution)
{
-runTorchNet("net_conv", GetParam(), "", false, true);
+if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+    (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+runTorchNet("net_conv", "", false, true);
}
TEST_P(Test_Torch_layers, run_pool_max)
{
-runTorchNet("net_pool_max", GetParam(), "", true);
+if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_pool_max", "", true);
}
TEST_P(Test_Torch_layers, run_pool_ave)
{
-runTorchNet("net_pool_ave", GetParam());
+runTorchNet("net_pool_ave");
}
TEST_P(Test_Torch_layers, run_reshape)
{
-int targetId = GetParam();
-runTorchNet("net_reshape", targetId);
-runTorchNet("net_reshape_batch", targetId);
-runTorchNet("net_reshape_single_sample", targetId);
-runTorchNet("net_reshape_channels", targetId, "", false, true);
+runTorchNet("net_reshape");
+runTorchNet("net_reshape_batch");
+runTorchNet("net_reshape_channels", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_reshape_single_sample)
+{
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_reshape_single_sample", "", false, false,
+            (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.0052 : 0.0);
}
TEST_P(Test_Torch_layers, run_linear)
{
-runTorchNet("net_linear_2d", GetParam());
+if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    throw SkipTestException("");
+runTorchNet("net_linear_2d");
}
TEST_P(Test_Torch_layers, run_concat)
{
-int targetId = GetParam();
-runTorchNet("net_concat", targetId, "l5_torchMerge");
-runTorchNet("net_depth_concat", targetId, "", false, true);
+runTorchNet("net_concat", "l5_torchMerge");
+runTorchNet("net_depth_concat", "", false, true, 0.0,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.021 : 0.0);
}
TEST_P(Test_Torch_layers, run_deconv)
{
-runTorchNet("net_deconv", GetParam());
+runTorchNet("net_deconv");
}
TEST_P(Test_Torch_layers, run_batch_norm)
{
-runTorchNet("net_batch_norm", GetParam(), "", false, true);
+runTorchNet("net_batch_norm", "", false, true);
}
TEST_P(Test_Torch_layers, net_prelu)
{
-runTorchNet("net_prelu", GetParam());
+runTorchNet("net_prelu");
}
TEST_P(Test_Torch_layers, net_cadd_table)
{
-runTorchNet("net_cadd_table", GetParam());
+runTorchNet("net_cadd_table");
}
TEST_P(Test_Torch_layers, net_softmax)
{
-int targetId = GetParam();
-runTorchNet("net_softmax", targetId);
-runTorchNet("net_softmax_spatial", targetId);
+runTorchNet("net_softmax");
+runTorchNet("net_softmax_spatial");
}
TEST_P(Test_Torch_layers, net_logsoftmax)
@@ -173,40 +192,55 @@ TEST_P(Test_Torch_layers, net_logsoftmax)
TEST_P(Test_Torch_layers, net_lp_pooling)
{
-int targetId = GetParam();
-runTorchNet("net_lp_pooling_square", targetId, "", false, true);
-runTorchNet("net_lp_pooling_power", targetId, "", false, true);
+runTorchNet("net_lp_pooling_square", "", false, true);
+runTorchNet("net_lp_pooling_power", "", false, true);
}
TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
{
-runTorchNet("net_conv_gemm_lrn", GetParam(), "", false, true);
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+    throw SkipTestException("");
+runTorchNet("net_conv_gemm_lrn", "", false, true,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.046 : 0.0,
+            target == DNN_TARGET_OPENCL_FP16 ? 0.023 : 0.0);
}
TEST_P(Test_Torch_layers, net_inception_block)
{
-runTorchNet("net_inception_block", GetParam(), "", false, true);
+runTorchNet("net_inception_block", "", false, true);
}
TEST_P(Test_Torch_layers, net_normalize)
{
-runTorchNet("net_normalize", GetParam(), "", false, true);
+runTorchNet("net_normalize", "", false, true);
}
TEST_P(Test_Torch_layers, net_padding)
{
-int targetId = GetParam();
-runTorchNet("net_padding", targetId, "", false, true);
-runTorchNet("net_spatial_zero_padding", targetId, "", false, true);
-runTorchNet("net_spatial_reflection_padding", targetId, "", false, true);
+runTorchNet("net_padding", "", false, true);
+runTorchNet("net_spatial_zero_padding", "", false, true);
+runTorchNet("net_spatial_reflection_padding", "", false, true);
}
TEST_P(Test_Torch_layers, net_non_spatial)
{
-runTorchNet("net_non_spatial", GetParam(), "", false, true);
+if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
+    (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+    throw SkipTestException("");
+runTorchNet("net_non_spatial", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_paralel)
+{
+if (backend != DNN_BACKEND_OPENCV || target != DNN_TARGET_CPU)
+    throw SkipTestException("");
+runTorchNet("net_parallel", "l5_torchMerge");
}
-INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, availableDnnTargets());
+TEST_P(Test_Torch_layers, net_residual)
+{
+runTorchNet("net_residual", "", false, true);
+}
typedef testing::TestWithParam<Target> Test_Torch_nets;
@@ -313,21 +347,6 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_nets, availableDnnTargets());
-// TODO: fix OpenCL and add to the rest of tests
-TEST(Torch_Importer, run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge");
-}
-TEST(Torch_Importer, DISABLED_run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_OPENCL, "l5_torchMerge");
-}
-TEST(Torch_Importer, net_residual)
-{
-    runTorchNet("net_residual", DNN_TARGET_CPU, "", false, true);
-}
// Test a custom layer
// https://github.com/torch/nn/blob/master/doc/convolution.md#nn.SpatialUpSamplingNearest
@@ -374,17 +393,29 @@ public:
}
}
virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE {}
private:
int scale;
};
-TEST(Torch_Importer, upsampling_nearest)
+TEST_P(Test_Torch_layers, upsampling_nearest)
{
// Test a custom layer.
CV_DNN_REGISTER_LAYER_CLASS(SpatialUpSamplingNearest, SpatialUpSamplingNearestLayer);
-runTorchNet("net_spatial_upsampling_nearest", DNN_TARGET_CPU, "", false, true);
+try
{
runTorchNet("net_spatial_upsampling_nearest", "", false, true);
}
catch (...)
{
LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
throw;
}
LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
// Test an implemented layer.
runTorchNet("net_spatial_upsampling_nearest", "", false, true);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, dnnBackendsAndTargets());
} }

@@ -307,8 +307,8 @@ icvLoadWindowPos( const char* name, CvRect& rect )
{
HKEY hkey;
char szKey[1024];
-strcpy( szKey, icvWindowPosRootKey );
-strcat( szKey, name );
+strcpy_s( szKey, 1024, icvWindowPosRootKey );
+strcat_s( szKey, 1024, name );
rect.x = rect.y = CW_USEDEFAULT;
rect.width = rect.height = 320;
@@ -368,8 +368,8 @@ icvSaveWindowPos( const char* name, CvRect rect )
HKEY hkey;
char szKey[1024];
char rootKey[1024];
-strcpy( szKey, icvWindowPosRootKey );
-strcat( szKey, name );
+strcpy_s( szKey, 1024, icvWindowPosRootKey );
+strcat_s( szKey, 1024, name );
if( RegOpenKeyEx( HKEY_CURRENT_USER,szKey,0,KEY_READ,&hkey) != ERROR_SUCCESS )
{
@@ -379,7 +379,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
char oldestKey[1024];
char currentKey[1024];
-strcpy( rootKey, icvWindowPosRootKey );
+strcpy_s( rootKey, 1024, icvWindowPosRootKey );
rootKey[strlen(rootKey)-1] = '\0';
if( RegCreateKeyEx(HKEY_CURRENT_USER, rootKey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_READ+KEY_WRITE, 0, &hroot, NULL) != ERROR_SUCCESS )
//RegOpenKeyEx( HKEY_CURRENT_USER,rootKey,0,KEY_READ,&hroot) != ERROR_SUCCESS )
@@ -398,7 +398,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
oldestTime.dwLowDateTime > accesstime.dwLowDateTime) )
{
oldestTime = accesstime;
-strcpy( oldestKey, currentKey );
+strcpy_s( oldestKey, 1024, currentKey );
}
}
@@ -1500,6 +1500,8 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
rgn = CreateRectRgn(0, 0, wrc.right, wrc.bottom);
rgn1 = CreateRectRgn(cr.left, cr.top, cr.right, cr.bottom);
rgn2 = CreateRectRgn(tr.left, tr.top, tr.right, tr.bottom);
CV_Assert(rgn != 0, rgn1 != 0, rgn2 != 0);
ret = CombineRgn(rgn, rgn, rgn1, RGN_DIFF);
ret = CombineRgn(rgn, rgn, rgn2, RGN_DIFF);

@@ -1771,7 +1771,7 @@ Corners in the image can be found as the local maxima of this response map.
size as src .
@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
@param ksize Aperture parameter for the Sobel operator.
-@param k Harris detector free parameter. See the formula below.
+@param k Harris detector free parameter. See the formula above.
@param borderType Pixel extrapolation method. See #BorderTypes.
*/
CV_EXPORTS_W void cornerHarris( InputArray src, OutputArray dst, int blockSize,

@@ -20,8 +20,12 @@ endforeach()
set(opencv_hdrs "")
set(opencv_userdef_hdrs "")
foreach(m ${OPENCV_PYTHON_MODULES})
-  ocv_list_filter(OPENCV_MODULE_${m}_HEADERS "${OPENCV_MODULE_${m}_LOCATION}/include" __hdrs)
-  list(APPEND opencv_hdrs ${__hdrs})
+  foreach (hdr ${OPENCV_MODULE_${m}_HEADERS})
+    ocv_is_subdir(is_sub "${OPENCV_MODULE_${m}_LOCATION}/include" "${hdr}")
if(is_sub)
list(APPEND opencv_hdrs "${hdr}")
endif()
endforeach()
file(GLOB userdef_hdrs ${OPENCV_MODULE_${m}_LOCATION}/misc/python/pyopencv*.hpp)
list(APPEND opencv_userdef_hdrs ${userdef_hdrs})
endforeach(m)

@@ -379,10 +379,9 @@ struct TSParams
class TS
{
-public:
-// constructor(s) and destructor
TS();
virtual ~TS();
+public:
enum
{
@@ -484,9 +483,6 @@ public:
SKIPPED=1
};
-// get file storage
-CvFileStorage* get_file_storage();
// get RNG to generate random input data for a test
RNG& get_rng() { return rng; }
@@ -629,9 +625,6 @@ struct DefaultRngAuto
void fillGradient(Mat& img, int delta = 5);
void smoothBorder(Mat& img, const Scalar& color, int delta = 3);
-void printVersionInfo(bool useStdOut = true);
// Utility functions
void addDataSearchPath(const std::string& path);
@@ -660,6 +653,13 @@ std::string findDataFile(const std::string& relative_path, bool required = true)
*/
std::string findDataDirectory(const std::string& relative_path, bool required = true);
// Test definitions
class SystemInfoCollector : public testing::EmptyTestEventListener
{
private:
virtual void OnTestProgramStart(const testing::UnitTest&);
};
#ifndef __CV_TEST_EXEC_ARGS
#if defined(_MSC_VER) && (_MSC_VER <= 1400)
@@ -671,15 +671,6 @@ std::string findDataDirectory(const std::string& relative_path, bool required =
#endif
#endif
-#ifdef HAVE_OPENCL
-namespace ocl {
-void dumpOpenCLDevice();
-}
-#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
-#else
-#define TEST_DUMP_OCL_INFO
-#endif
void parseCustomOptions(int argc, char **argv);
#define CV_TEST_INIT0_NOOP (void)0
@@ -696,8 +687,7 @@ int main(int argc, char **argv) \
ts->init(resourcesubdir); \
__CV_TEST_EXEC_ARGS(CV_TEST_INIT0_ ## INIT0) \
::testing::InitGoogleTest(&argc, argv); \
-cvtest::printVersionInfo(); \
-TEST_DUMP_OCL_INFO \
+::testing::UnitTest::GetInstance()->listeners().Append(new SystemInfoCollector); \
__CV_TEST_EXEC_ARGS(__VA_ARGS__) \
parseCustomOptions(argc, argv); \
} \

@@ -637,15 +637,6 @@ void PrintTo(const Size& sz, ::std::ostream* os);
#endif
#endif
-#ifdef HAVE_OPENCL
-namespace cvtest { namespace ocl {
-void dumpOpenCLDevice();
-}}
-#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
-#else
-#define TEST_DUMP_OCL_INFO
-#endif
#define CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, ...) \
CV_TRACE_FUNCTION(); \
@@ -654,11 +645,10 @@ void dumpOpenCLDevice();
::perf::TestBase::Init(std::vector<std::string>(impls, impls + sizeof impls / sizeof *impls), \
                       argc, argv); \
::testing::InitGoogleTest(&argc, argv); \
-cvtest::printVersionInfo(); \
+::testing::UnitTest::GetInstance()->listeners().Append(new cvtest::SystemInfoCollector); \
::testing::Test::RecordProperty("cv_module_name", #modulename); \
::perf::TestBase::RecordRunParameters(); \
__CV_TEST_EXEC_ARGS(__VA_ARGS__) \
-TEST_DUMP_OCL_INFO \
} \
return RUN_ALL_TESTS();

@@ -43,25 +43,6 @@
#include "opencv2/ts/ocl_test.hpp"
-#ifdef HAVE_OPENCL
-#define DUMP_CONFIG_PROPERTY(propertyName, propertyValue) \
-do { \
-    std::stringstream ssName, ssValue;\
-    ssName << propertyName;\
-    ssValue << (propertyValue); \
-    ::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
-} while (false)
-#define DUMP_MESSAGE_STDOUT(msg) \
-do { \
-    std::cout << msg << std::endl; \
-} while (false)
-#include <opencv2/core/opencl/opencl_info.hpp>
-#endif // HAVE_OPENCL
namespace cvtest {
namespace ocl {
@@ -69,13 +50,6 @@ using namespace cv;
int test_loop_times = 1; // TODO Read from command line / environment
-#ifdef HAVE_OPENCL
-void dumpOpenCLDevice()
-{
-    cv::dumpOpenCLInformation();
-}
-#endif // HAVE_OPENCL
Mat TestUtils::readImage(const String &fileName, int flags)
{
return cv::imread(cvtest::TS::ptr()->get_data_path() + fileName, flags);

@@ -74,7 +74,26 @@
# include <sys/stat.h>
#endif
#ifdef HAVE_OPENCL
#define DUMP_CONFIG_PROPERTY(propertyName, propertyValue) \
do { \
std::stringstream ssName, ssValue;\
ssName << propertyName;\
ssValue << (propertyValue); \
::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
} while (false)
#define DUMP_MESSAGE_STDOUT(msg) \
do { \
std::cout << msg << std::endl; \
} while (false)
#include "opencv2/core/opencl/opencl_info.hpp"
#endif // HAVE_OPENCL
#include "opencv2/core/utility.hpp"
#include "opencv_tests_config.hpp" #include "opencv_tests_config.hpp"
namespace opencv_test { namespace opencv_test {
@@ -230,7 +249,6 @@ bool BaseTest::can_do_fast_forward()
 void BaseTest::safe_run( int start_from )
 {
     CV_TRACE_FUNCTION();
-    read_params( ts->get_file_storage() );
     ts->update_context( 0, -1, true );
     ts->update_context( this, -1, true );
@@ -552,8 +570,6 @@ void TS::set_gtest_status()
 }
 
-CvFileStorage* TS::get_file_storage() { return 0; }
-
 void TS::update_context( BaseTest* test, int test_case_idx, bool update_ts_context )
 {
     if( current_test_info.test != test )
@@ -614,8 +630,11 @@ void TS::printf( int streams, const char* fmt, ... )
 }
 
-static TS ts;
-TS* TS::ptr() { return &ts; }
+TS* TS::ptr()
+{
+    static TS ts;
+    return &ts;
+}
 
 void fillGradient(Mat& img, int delta)
 {
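Editor's note: the TS::ptr() change above is the classic function-local-static ("Meyers") singleton: the instance is constructed on first call rather than at static-initialization time, which sidesteps initialization-order issues between translation units and is thread-safe since C++11. A standalone sketch of the pattern (names are illustrative):

    #include <iostream>

    class Registry
    {
    public:
        static Registry& instance()
        {
            static Registry r;  // constructed once, on first call
            return r;
        }
        void ping() { std::cout << "alive" << std::endl; }
    private:
        Registry() {}           // construction only via instance()
    };

    int main()
    {
        Registry::instance().ping();
        return 0;
    }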
@@ -866,6 +885,65 @@ std::string findDataDirectory(const std::string& relative_path, bool required)
     return findData(relative_path, required, true);
 }
 
+inline static std::string getSnippetFromConfig(const std::string & start, const std::string & end)
+{
+    const std::string buildInfo = cv::getBuildInformation();
+    size_t pos1 = buildInfo.find(start);
+    if (pos1 != std::string::npos)
+    {
+        pos1 += start.length();
+        pos1 = buildInfo.find_first_not_of(" \t\n\r", pos1);
+    }
+    size_t pos2 = buildInfo.find(end, pos1);
+    if (pos2 != std::string::npos)
+    {
+        pos2 = buildInfo.find_last_not_of(" \t\n\r", pos2);
+    }
+    if (pos1 != std::string::npos && pos2 != std::string::npos && pos1 < pos2)
+    {
+        return buildInfo.substr(pos1, pos2 - pos1 + 1);
+    }
+    return std::string();
+}
+
+inline static void recordPropertyVerbose(const std::string & property,
+                                         const std::string & msg,
+                                         const std::string & value,
+                                         const std::string & build_value = std::string())
+{
+    ::testing::Test::RecordProperty(property, value);
+    std::cout << msg << ": " << (value.empty() ? std::string("N/A") : value) << std::endl;
+    if (!build_value.empty())
+    {
+        ::testing::Test::RecordProperty(property + "_build", build_value);
+        if (build_value != value)
+            std::cout << "WARNING: build value differs from runtime: " << build_value << endl;
+    }
+}
+
+#ifdef _DEBUG
+#define CV_TEST_BUILD_CONFIG "Debug"
+#else
+#define CV_TEST_BUILD_CONFIG "Release"
+#endif
+
+void SystemInfoCollector::OnTestProgramStart(const testing::UnitTest&)
+{
+    std::cout << "CTEST_FULL_OUTPUT" << std::endl; // Tell CTest not to discard any output
+    recordPropertyVerbose("cv_version", "OpenCV version", cv::getVersionString(), CV_VERSION);
+    recordPropertyVerbose("cv_vcs_version", "OpenCV VCS version", getSnippetFromConfig("Version control:", "\n"));
+    recordPropertyVerbose("cv_build_type", "Build type", getSnippetFromConfig("Configuration:", "\n"), CV_TEST_BUILD_CONFIG);
+    recordPropertyVerbose("cv_compiler", "Compiler", getSnippetFromConfig("C++ Compiler:", "\n"));
+    recordPropertyVerbose("cv_parallel_framework", "Parallel framework", cv::currentParallelFramework());
+    recordPropertyVerbose("cv_cpu_features", "CPU features", cv::getCPUFeaturesLine());
+#ifdef HAVE_IPP
+    recordPropertyVerbose("cv_ipp_version", "Intel(R) IPP version", cv::ipp::useIPP() ? cv::ipp::getIppVersion() : "disabled");
+#endif
+#ifdef HAVE_OPENCL
+    cv::dumpOpenCLInformation();
+#endif
+}
+
 } //namespace cvtest
 /* End of file. */

@@ -2973,143 +2973,6 @@ MatComparator::operator()(const char* expr1, const char* expr2,
     << "- " << expr2 << ":\n" << MatPart(m2part, border > 0 ? &loc : 0) << ".\n";
 }
 
-void printVersionInfo(bool useStdOut)
-{
-    // Tell CTest not to discard any output
-    if(useStdOut) std::cout << "CTEST_FULL_OUTPUT" << std::endl;
-
-    ::testing::Test::RecordProperty("cv_version", CV_VERSION);
-    if(useStdOut) std::cout << "OpenCV version: " << CV_VERSION << std::endl;
-
-    std::string buildInfo( cv::getBuildInformation() );
-
-    size_t pos1 = buildInfo.find("Version control");
-    size_t pos2 = buildInfo.find('\n', pos1);
-    if(pos1 != std::string::npos && pos2 != std::string::npos)
-    {
-        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
-        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
-        ::testing::Test::RecordProperty("cv_vcs_version", ver);
-        if (useStdOut) std::cout << "OpenCV VCS version: " << ver << std::endl;
-    }
-
-    pos1 = buildInfo.find("inner version");
-    pos2 = buildInfo.find('\n', pos1);
-    if(pos1 != std::string::npos && pos2 != std::string::npos)
-    {
-        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
-        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
-        ::testing::Test::RecordProperty("cv_inner_vcs_version", ver);
-        if(useStdOut) std::cout << "Inner VCS version: " << ver << std::endl;
-    }
-
-    const char * build_type =
-#ifdef _DEBUG
-        "debug";
-#else
-        "release";
-#endif
-
-    ::testing::Test::RecordProperty("cv_build_type", build_type);
-    if (useStdOut) std::cout << "Build type: " << build_type << std::endl;
-
-    const char* parallel_framework = currentParallelFramework();
-
-    if (parallel_framework) {
-        ::testing::Test::RecordProperty("cv_parallel_framework", parallel_framework);
-        if (useStdOut) std::cout << "Parallel framework: " << parallel_framework << std::endl;
-    }
-
-    std::string cpu_features;
-
-#if CV_POPCNT
-    if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
-#endif
-#if CV_MMX
-    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
-#endif
-#if CV_SSE
-    if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
-#endif
-#if CV_SSE2
-    if (checkHardwareSupport(CV_CPU_SSE2)) cpu_features += " sse2";
-#endif
-#if CV_SSE3
-    if (checkHardwareSupport(CV_CPU_SSE3)) cpu_features += " sse3";
-#endif
-#if CV_SSSE3
-    if (checkHardwareSupport(CV_CPU_SSSE3)) cpu_features += " ssse3";
-#endif
-#if CV_SSE4_1
-    if (checkHardwareSupport(CV_CPU_SSE4_1)) cpu_features += " sse4.1";
-#endif
-#if CV_SSE4_2
-    if (checkHardwareSupport(CV_CPU_SSE4_2)) cpu_features += " sse4.2";
-#endif
-#if CV_AVX
-    if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
-#endif
-#if CV_AVX2
-    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
-#endif
-#if CV_FMA3
-    if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
-#endif
-#if CV_AVX_512F
-    if (checkHardwareSupport(CV_CPU_AVX_512F)) cpu_features += " avx-512f";
-#endif
-#if CV_AVX_512BW
-    if (checkHardwareSupport(CV_CPU_AVX_512BW)) cpu_features += " avx-512bw";
-#endif
-#if CV_AVX_512CD
-    if (checkHardwareSupport(CV_CPU_AVX_512CD)) cpu_features += " avx-512cd";
-#endif
-#if CV_AVX_512DQ
-    if (checkHardwareSupport(CV_CPU_AVX_512DQ)) cpu_features += " avx-512dq";
-#endif
-#if CV_AVX_512ER
-    if (checkHardwareSupport(CV_CPU_AVX_512ER)) cpu_features += " avx-512er";
-#endif
-#if CV_AVX_512IFMA512
-    if (checkHardwareSupport(CV_CPU_AVX_512IFMA512)) cpu_features += " avx-512ifma512";
-#endif
-#if CV_AVX_512PF
-    if (checkHardwareSupport(CV_CPU_AVX_512PF)) cpu_features += " avx-512pf";
-#endif
-#if CV_AVX_512VBMI
-    if (checkHardwareSupport(CV_CPU_AVX_512VBMI)) cpu_features += " avx-512vbmi";
-#endif
-#if CV_AVX_512VL
-    if (checkHardwareSupport(CV_CPU_AVX_512VL)) cpu_features += " avx-512vl";
-#endif
-#if CV_NEON
-    if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
-#endif
-#if CV_FP16
-    if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
-#endif
-#if CV_VSX
-    if (checkHardwareSupport(CV_CPU_VSX)) cpu_features += " VSX";
-#endif
-
-    cpu_features.erase(0, 1); // erase initial space
-
-    ::testing::Test::RecordProperty("cv_cpu_features", cpu_features);
-    if (useStdOut) std::cout << "CPU features: " << cpu_features << std::endl;
-
-#ifdef HAVE_IPP
-    const char * ipp_optimization = cv::ipp::useIPP()? "enabled" : "disabled";
-    ::testing::Test::RecordProperty("cv_ipp_optimization", ipp_optimization);
-    if (useStdOut) std::cout << "Intel(R) IPP optimization: " << ipp_optimization << std::endl;
-
-    cv::String ippVer = cv::ipp::getIppVersion();
-    ::testing::Test::RecordProperty("cv_ipp_version", ippVer);
-    if(useStdOut) std::cout << "Intel(R) IPP version: " << ippVer.c_str() << std::endl;
-#endif
-}
-
 void threshold( const Mat& _src, Mat& _dst,
                 double thresh, double maxval, int thresh_type )
 {

@@ -905,7 +905,7 @@ public:
     /** @brief Writes the next video frame
 
-    @param image The written frame
+    @param image The written frame. In general, color images are expected in BGR format.
 
     The function/method writes the specified image to video file. It must have the same size as has
     been specified when opening the video writer.
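Editor's note: a minimal usage sketch of the documented contract (fixed frame size, BGR input); the file name, codec, and fps below are illustrative:

    #include "opencv2/videoio.hpp"

    int main()
    {
        cv::Size frameSize(640, 480);
        // Size and FPS are fixed at open time; every written frame must match frameSize.
        cv::VideoWriter writer("out.avi", cv::VideoWriter::fourcc('M','J','P','G'), 30.0, frameSize);
        if (!writer.isOpened())
            return 1;
        cv::Mat frame(frameSize, CV_8UC3, cv::Scalar(255, 0, 0)); // solid blue, BGR order
        for (int i = 0; i < 30; i++)
            writer.write(frame); // or: writer << frame;
        return 0;                // destructor releases the file
    }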

@@ -811,6 +811,8 @@ void videoDevice::NukeDownstream(IBaseFilter *pBF){
     IEnumPins *pins = NULL;
     PIN_INFO pininfo;
     HRESULT hr = pBF->EnumPins(&pins);
+    if (hr != S_OK || !pins)
+        return;
     pins->Reset();
     while (hr == NOERROR)
     {
@@ -838,7 +840,7 @@ void videoDevice::NukeDownstream(IBaseFilter *pBF){
             pP->Release();
         }
     }
-    if (pins) pins->Release();
+    pins->Release();
 }
@@ -999,17 +1001,6 @@ videoDevice::~videoDevice(){
         (pGraph) = 0;
     }
 
-    //delete our pointers
-    delete pDestFilter;
-    delete pVideoInputFilter;
-    delete pGrabberF;
-    delete pGrabber;
-    delete pControl;
-    delete streamConf;
-    delete pMediaEvent;
-    delete pCaptureGraph;
-    delete pGraph;
-
     DebugPrintOut("SETUP: Device %i disconnected and freed\n\n",myID);
 }
@@ -1654,7 +1645,7 @@ bool videoInput::getVideoSettingFilter(int deviceID, long Property, long &min, l
     IAMVideoProcAmp *pAMVideoProcAmp = NULL;
     hr = VD->pVideoInputFilter->QueryInterface(IID_IAMVideoProcAmp, (void**)&pAMVideoProcAmp);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !pAMVideoProcAmp){
         DebugPrintOut("setVideoSetting - QueryInterface Error\n");
 #if 0
         if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
@@ -1676,7 +1667,7 @@ bool videoInput::getVideoSettingFilter(int deviceID, long Property, long &min, l
         hr = pAMVideoProcAmp->Get(Property, &currentValue, &flags);
     }
-    if(pAMVideoProcAmp)pAMVideoProcAmp->Release();
+    pAMVideoProcAmp->Release();
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
     if(VD->pVideoInputFilter)VD->pVideoInputFilter = NULL;
@@ -1881,7 +1872,7 @@ bool videoInput::getVideoSettingCamera(int deviceID, long Property, long &min, l
     IAMCameraControl *pIAMCameraControl = NULL;
     hr = VD->pVideoInputFilter->QueryInterface(IID_IAMCameraControl, (void**)&pIAMCameraControl);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !pIAMCameraControl){
         DebugPrintOut("setVideoSetting - QueryInterface Error\n");
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
@@ -1902,7 +1893,7 @@ bool videoInput::getVideoSettingCamera(int deviceID, long Property, long &min, l
         hr = pIAMCameraControl->Get(Property, &currentValue, &flags);
     }
-    if(pIAMCameraControl)pIAMCameraControl->Release();
+    pIAMCameraControl->Release();
 #if 0
     if(VD->pVideoInputFilter)VD->pVideoInputFilter->Release();
     if(VD->pVideoInputFilter)VD->pVideoInputFilter = NULL;
@@ -2595,7 +2586,7 @@ int videoInput::start(int deviceID, videoDevice *VD){
     //we do this because webcams don't have a preview mode
     hr = VD->pCaptureGraph->FindInterface(&CAPTURE_MODE, &MEDIATYPE_Video, VD->pVideoInputFilter, IID_IAMStreamConfig, (void **)&VD->streamConf);
-    if(FAILED(hr)){
+    if(FAILED(hr) || !VD->streamConf){
         DebugPrintOut("ERROR: Couldn't config the stream!\n");
         stopDevice(deviceID);
         return hr;
@@ -2737,14 +2728,8 @@ int videoInput::start(int deviceID, videoDevice *VD){
     //lets try freeing our stream conf here too
     //this will fail if the device is already running
-    if(VD->streamConf){
         VD->streamConf->Release();
         VD->streamConf = NULL;
-    }else{
-        DebugPrintOut("ERROR: connecting device - prehaps it is already being used?\n");
-        stopDevice(deviceID);
-        return S_FALSE;
-    }
 
     //NULL RENDERER//
@@ -3093,7 +3078,7 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
     IAMCrossbar *pXBar1 = NULL;
     HRESULT hr = pBuild->FindInterface(&LOOK_UPSTREAM_ONLY, NULL, pVidFilter,
                                        IID_IAMCrossbar, (void**)&pXBar1);
-    if (SUCCEEDED(hr))
+    if (SUCCEEDED(hr) && pXBar1)
     {
         bool foundDevice = false;
@@ -3163,10 +3148,6 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
         //we were getting a crash otherwise
         //if(Crossbar)Crossbar->Release();
        //if(Crossbar)Crossbar = NULL;
-
-        if(pXBar1)pXBar1->Release();
-        if(pXBar1)pXBar1 = NULL;
-
     }else{
         DebugPrintOut("SETUP: You are a webcam or snazzy firewire cam! No Crossbar needed\n");
         return hr;
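Editor's note: the recurring hardening pattern in these DirectShow hunks is to check both the HRESULT and the returned interface pointer before dereferencing, after which Release() can be called unconditionally. A condensed sketch of the pattern under those assumptions (the helper and property are illustrative, not from this patch):

    #include <dshow.h>   // DirectShow; link with strmiids.lib

    // Read a filter property defensively, mirroring the patched code.
    static bool getBrightness(IBaseFilter* filter, long& value)
    {
        IAMVideoProcAmp* amp = NULL;
        HRESULT hr = filter->QueryInterface(IID_IAMVideoProcAmp, (void**)&amp);
        if (FAILED(hr) || !amp)   // trust neither the HRESULT nor the out-pointer alone
            return false;
        long flags = 0;
        hr = amp->Get(VideoProcAmp_Brightness, &value, &flags);
        amp->Release();           // unconditional: the early return guarantees amp != NULL
        return SUCCEEDED(hr);
    }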

@@ -1224,7 +1224,11 @@ Ptr<IVideoCapture> cv::createGStreamerCapture(int index)
 class CvVideoWriter_GStreamer : public CvVideoWriter
 {
 public:
-    CvVideoWriter_GStreamer() { init(); }
+    CvVideoWriter_GStreamer()
+        : pipeline(0), source(0), encodebin(0), file(0), buffer(0), input_pix_fmt(0),
+          num_frames(0), framerate(0)
+    {
+    }
     virtual ~CvVideoWriter_GStreamer() CV_OVERRIDE { close(); }
 
     virtual bool open( const char* filename, int fourcc,
@@ -1232,7 +1236,6 @@ public:
     virtual void close();
     virtual bool writeFrame( const IplImage* image ) CV_OVERRIDE;
 protected:
-    void init();
     const char* filenameToMimetype(const char* filename);
     GstElement* pipeline;
     GstElement* source;
@@ -1245,22 +1248,6 @@ protected:
     double framerate;
 };
 
-/*!
- * \brief CvVideoWriter_GStreamer::init
- * initialise all variables
- */
-void CvVideoWriter_GStreamer::init()
-{
-    pipeline = NULL;
-    source = NULL;
-    encodebin = NULL;
-    file = NULL;
-    buffer = NULL;
-    num_frames = 0;
-    framerate = 0;
-}
-
 /*!
  * \brief CvVideoWriter_GStreamer::close
  * ends the pipeline by sending EOS and destroys the pipeline and all
@@ -1282,17 +1269,19 @@ void CvVideoWriter_GStreamer::close()
     //wait for EOS to trickle down the pipeline. This will let all elements finish properly
     GstBus* bus = gst_element_get_bus(pipeline);
     GstMessage *msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE, (GstMessageType)(GST_MESSAGE_ERROR | GST_MESSAGE_EOS));
-    if (GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR)
+    if (!msg || GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR)
     {
         CV_WARN("Error during VideoWriter finalization\n");
-        return;
-    }
-
-    if(msg != NULL)
-    {
-        gst_message_unref(msg);
-        g_object_unref(G_OBJECT(bus));
-    }
+        if(msg != NULL)
+        {
+            gst_message_unref(msg);
+            g_object_unref(G_OBJECT(bus));
+        }
+        return;
+    }
+
+    gst_message_unref(msg);
+    g_object_unref(G_OBJECT(bus));
 
     status = gst_element_set_state (pipeline, GST_STATE_NULL);
     if (status == GST_STATE_CHANGE_ASYNC)
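Editor's note: for readers unfamiliar with the EOS handshake the comment above describes, here is the general finalization pattern as a minimal standalone sketch (assuming an already-built GStreamer 1.x pipeline; error handling trimmed):

    // Finalize a pipeline: send EOS, wait for it to drain, then tear down.
    gst_element_send_event(pipeline, gst_event_new_eos());
    GstBus* bus = gst_element_get_bus(pipeline);
    GstMessage* msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE,
            (GstMessageType)(GST_MESSAGE_ERROR | GST_MESSAGE_EOS));
    if (msg)
        gst_message_unref(msg);       // EOS (or an error) reached the bus
    g_object_unref(G_OBJECT(bus));
    gst_element_set_state(pipeline, GST_STATE_NULL);  // safe to destroy now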

@@ -91,7 +91,7 @@ static bool pMFCreateDXGIDeviceManager_initialized = false;
 static FN_MFCreateDXGIDeviceManager pMFCreateDXGIDeviceManager = NULL;
 static void init_MFCreateDXGIDeviceManager()
 {
-    HMODULE h = LoadLibraryA("mfplat.dll");
+    HMODULE h = LoadLibraryExA("mfplat.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
     if (h)
     {
         pMFCreateDXGIDeviceManager = (FN_MFCreateDXGIDeviceManager)GetProcAddress(h, "MFCreateDXGIDeviceManager");
@@ -1720,7 +1720,7 @@ bool CvCapture_MSMF::setProperty( int property_id, double value )
             return setTime(duration * value, true);
         break;
     case CV_CAP_PROP_POS_FRAMES:
-        if (getFramerate(nativeFormat) != 0)
+        if (std::fabs(getFramerate(nativeFormat)) > 0)
             return setTime(value * 1e7 / getFramerate(nativeFormat), false);
         break;
     case CV_CAP_PROP_POS_MSEC:
@@ -1978,7 +1978,17 @@ private:
 CvVideoWriter_MSMF::CvVideoWriter_MSMF():
     MF(Media_Foundation::getInstance()),
-    initiated(false)
+    videoWidth(0),
+    videoHeight(0),
+    fps(0),
+    bitRate(0),
+    frameSize(0),
+    encodingFormat(),
+    inputFormat(),
+    streamIndex(0),
+    initiated(false),
+    rtStart(0),
+    rtDuration(0)
 {
 }

@@ -377,8 +377,8 @@ LRESULT PASCAL CvCaptureCAM_VFW::frameCallback( HWND hWnd, VIDEOHDR* hdr )
     if (!hWnd) return FALSE;
 
     capture = (CvCaptureCAM_VFW*)capGetUserData(hWnd);
+    if (!capture) return (LRESULT)FALSE;
     capture->hdr = hdr;
     return (LRESULT)TRUE;
 }

@@ -12,26 +12,18 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
 #include <iostream>
-#include <stdlib.h>
 
 using namespace std;
 using namespace cv;
 
-Mat img0, img1, res1, final;
+Mat src, img1, mask, final;
 
 Point point;
+vector<Point> pts;
 int drag = 0;
-int numpts = 100;
-Point* pts = new Point[100];
 int var = 0;
 int flag = 0;
-int flag1 = 0;
-
-int minx,miny,maxx,maxy,lenx,leny;
 
 void mouseHandler(int, int, int, int, void*);
@@ -40,16 +32,17 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_LBUTTONDOWN && !drag)
     {
-        if(flag1 == 0)
+        if (flag == 0)
         {
-            if(var==0)
-                img1 = img0.clone();
+            if (var == 0)
+                img1 = src.clone();
             point = Point(x, y);
-            circle(img1,point,2,Scalar(0, 0, 255),-1, 8, 0);
-            pts[var] = point;
+            circle(img1, point, 2, Scalar(0, 0, 255), -1, 8, 0);
+            pts.push_back(point);
             var++;
             drag = 1;
-            if(var>1)
+
+            if (var > 1)
                 line(img1,pts[var-2], point, Scalar(0, 0, 255), 2, 8, 0);
 
             imshow("Source", img1);
@@ -59,31 +52,18 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_LBUTTONUP && drag)
     {
         imshow("Source", img1);
         drag = 0;
     }
 
-    if (event == EVENT_RBUTTONDOWN)
-    {
-        flag1 = 1;
-        img1 = img0.clone();
-
-        for(int i = var; i < numpts ; i++)
-            pts[i] = point;
-
-        if(var!=0)
-        {
-            const Point* pts3[1] = {&pts[0]};
-            polylines( img1, pts3, &numpts,1, 1, Scalar(0,0,0), 2, 8, 0);
-        }
-
-        for(int i=0;i<var;i++)
-        {
-            minx = min(minx,pts[i].x);
-            maxx = max(maxx,pts[i].x);
-            miny = min(miny,pts[i].y);
-            maxy = max(maxy,pts[i].y);
-        }
-
-        lenx = maxx - minx;
-        leny = maxy - miny;
+    if (event == EVENT_RBUTTONDOWN)
+    {
+        flag = 1;
+        img1 = src.clone();
+
+        if (var != 0)
+        {
+            polylines( img1, pts, 1, Scalar(0,0,0), 2, 8, 0);
+        }
 
         imshow("Source", img1);
     }
@@ -91,71 +71,49 @@ void mouseHandler(int event, int x, int y, int, void*)
     if (event == EVENT_RBUTTONUP)
     {
         flag = var;
 
-        final = Mat::zeros(img0.size(),CV_8UC3);
-        res1 = Mat::zeros(img0.size(),CV_8UC1);
-        const Point* pts4[1] = {&pts[0]};
-
-        fillPoly(res1, pts4,&numpts, 1, Scalar(255, 255, 255), 8, 0);
-        bitwise_and(img0, img0, final,res1);
-        imshow("mask",res1);
-        imwrite("mask.png",res1);
+        final = Mat::zeros(src.size(), CV_8UC3);
+        mask = Mat::zeros(src.size(), CV_8UC1);
+        vector<vector<Point> > vpts;
+        vpts.push_back(pts);
+        fillPoly(mask, vpts, Scalar(255, 255, 255), 8, 0);
+        bitwise_and(src, src, final, mask);
+        imshow("Mask", mask);
+        imshow("Result", final);
 
         imshow("Source", img1);
     }
 
     if (event == EVENT_MBUTTONDOWN)
     {
-        for(int i = 0; i < numpts ; i++)
-        {
-            pts[i].x=0;
-            pts[i].y=0;
-        }
+        pts.clear();
         var = 0;
-        flag1 = 0;
-        minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
-        imshow("Source", img0);
         drag = 0;
+        flag = 0;
+        imshow("Source", src);
     }
 }
 
-static void help()
+int main(int argc, char **argv)
 {
-    cout << "\nThis program demonstrates using mouse events"
-        "\nCall:\n"
-        "./create_mask <image_name>\n"
-        "\n"
-        "\tleft mouse button - set a point to create mask shape"
-        "\n"
+    CommandLineParser parser(argc, argv, "{@input | ../data/lena.jpg | input image}");
+    parser.about("This program demonstrates using mouse events\n");
+    parser.printMessage();
+    cout << "\n\tleft mouse button - set a point to create mask shape\n"
         "\tright mouse button - create mask from points\n"
-        "\tmiddle mouse button - reset\n" << endl;
-}
-
-int main(int argc, char **argv)
-{
-    cv::CommandLineParser parser(argc, argv, "{@input | ../data/lena.jpg | input image}");
-    help();
-    string input_image = parser.get<string>("@input");
-    if (input_image.empty())
+        "\tmiddle mouse button - reset\n";
+
+    String input_image = parser.get<String>("@input");
+
+    src = imread(input_image);
+
+    if (src.empty())
     {
-        parser.printMessage();
-        parser.printErrors();
+        printf("Error opening image: %s\n", input_image.c_str());
         return 0;
     }
 
-    Mat src = imread(input_image);
-
-    minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
-
-    img0 = src;
-
-    res1 = Mat::zeros(img0.size(),CV_8UC1);
-    final = Mat::zeros(img0.size(),CV_8UC3);
-
-    //////////// source image ///////////////////
-    namedWindow("Source", 1);
+    namedWindow("Source", WINDOW_AUTOSIZE);
     setMouseCallback("Source", mouseHandler, NULL);
-    imshow("Source", img0);
+    imshow("Source", src);
     waitKey(0);
 
     return 0;
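Editor's note: the sample's switch from Point*/count arrays to vector<Point> relies on the InputArrayOfArrays overloads of fillPoly/polylines. A minimal sketch of that usage (the triangle and sizes are illustrative):

    #include "opencv2/core.hpp"
    #include "opencv2/imgproc.hpp"
    #include <vector>

    int main()
    {
        cv::Mat mask = cv::Mat::zeros(200, 200, CV_8UC1);
        std::vector<cv::Point> poly;                   // one polygon as a point list
        poly.push_back(cv::Point(20, 20));
        poly.push_back(cv::Point(180, 40));
        poly.push_back(cv::Point(100, 170));
        std::vector<std::vector<cv::Point> > polys(1, poly); // fillPoly takes a list of polygons
        cv::fillPoly(mask, polys, cv::Scalar(255));
        return cv::countNonZero(mask) > 0 ? 0 : 1;     // mask now contains the filled triangle
    }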

@@ -0,0 +1,149 @@
+/**
+ * @brief You will learn how to recover an out-of-focus image by Wiener filter
+ * @author Karpushin Vladislav, karpushin@ngs.ru, https://github.com/VladKarpushin
+ */
+
+#include <iostream>
+#include "opencv2/imgproc.hpp"
+#include "opencv2/imgcodecs.hpp"
+
+using namespace cv;
+using namespace std;
+
+void help();
+void calcPSF(Mat& outputImg, Size filterSize, int R);
+void fftshift(const Mat& inputImg, Mat& outputImg);
+void filter2DFreq(const Mat& inputImg, Mat& outputImg, const Mat& H);
+void calcWnrFilter(const Mat& input_h_PSF, Mat& output_G, double nsr);
+
+const String keys =
+"{help h usage ? |             | print this message   }"
+"{image          |original.JPG | input image name     }"
+"{R              |53           | radius               }"
+"{SNR            |5200         | signal to noise ratio}"
+;
+
+int main(int argc, char *argv[])
+{
+    help();
+    CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int R = parser.get<int>("R");
+    int snr = parser.get<int>("SNR");
+    string strInFileName = parser.get<String>("image");
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 0;
+    }
+
+    Mat imgIn;
+    imgIn = imread(strInFileName, IMREAD_GRAYSCALE);
+    if (imgIn.empty()) //check whether the image is loaded or not
+    {
+        cout << "ERROR : Image cannot be loaded..!!" << endl;
+        return -1;
+    }
+
+    Mat imgOut;
+
+//! [main]
+    // it needs to process even image only
+    Rect roi = Rect(0, 0, imgIn.cols & -2, imgIn.rows & -2);
+
+    //Hw calculation (start)
+    Mat Hw, h;
+    calcPSF(h, roi.size(), R);
+    calcWnrFilter(h, Hw, 1.0 / double(snr));
+    //Hw calculation (stop)
+
+    // filtering (start)
+    filter2DFreq(imgIn(roi), imgOut, Hw);
+    // filtering (stop)
+//! [main]
+
+    imgOut.convertTo(imgOut, CV_8U);
+    normalize(imgOut, imgOut, 0, 255, NORM_MINMAX);
+    imwrite("result.jpg", imgOut);
+    return 0;
+}
+
+void help()
+{
+    cout << "2018-07-12" << endl;
+    cout << "DeBlur_v8" << endl;
+    cout << "You will learn how to recover an out-of-focus image by Wiener filter" << endl;
+}
+
+//! [calcPSF]
+void calcPSF(Mat& outputImg, Size filterSize, int R)
+{
+    Mat h(filterSize, CV_32F, Scalar(0));
+    Point point(filterSize.width / 2, filterSize.height / 2);
+    circle(h, point, R, 255, -1, 8);
+    Scalar summa = sum(h);
+    outputImg = h / summa[0];
+}
+//! [calcPSF]
+
+//! [fftshift]
+void fftshift(const Mat& inputImg, Mat& outputImg)
+{
+    outputImg = inputImg.clone();
+    int cx = outputImg.cols / 2;
+    int cy = outputImg.rows / 2;
+    Mat q0(outputImg, Rect(0, 0, cx, cy));
+    Mat q1(outputImg, Rect(cx, 0, cx, cy));
+    Mat q2(outputImg, Rect(0, cy, cx, cy));
+    Mat q3(outputImg, Rect(cx, cy, cx, cy));
+    Mat tmp;
+    q0.copyTo(tmp);
+    q3.copyTo(q0);
+    tmp.copyTo(q3);
+    q1.copyTo(tmp);
+    q2.copyTo(q1);
+    tmp.copyTo(q2);
+}
+//! [fftshift]
+
+//! [filter2DFreq]
+void filter2DFreq(const Mat& inputImg, Mat& outputImg, const Mat& H)
+{
+    Mat planes[2] = { Mat_<float>(inputImg.clone()), Mat::zeros(inputImg.size(), CV_32F) };
+    Mat complexI;
+    merge(planes, 2, complexI);
+    dft(complexI, complexI, DFT_SCALE);
+
+    Mat planesH[2] = { Mat_<float>(H.clone()), Mat::zeros(H.size(), CV_32F) };
+    Mat complexH;
+    merge(planesH, 2, complexH);
+    Mat complexIH;
+    mulSpectrums(complexI, complexH, complexIH, 0);
+
+    idft(complexIH, complexIH);
+    split(complexIH, planes);
+    outputImg = planes[0];
+}
+//! [filter2DFreq]
+
+//! [calcWnrFilter]
+void calcWnrFilter(const Mat& input_h_PSF, Mat& output_G, double nsr)
+{
+    Mat h_PSF_shifted;
+    fftshift(input_h_PSF, h_PSF_shifted);
+    Mat planes[2] = { Mat_<float>(h_PSF_shifted.clone()), Mat::zeros(h_PSF_shifted.size(), CV_32F) };
+    Mat complexI;
+    merge(planes, 2, complexI);
+    dft(complexI, complexI);
+    split(complexI, planes);
+    Mat denom;
+    pow(abs(planes[0]), 2, denom);
+    denom += nsr;
+    divide(planes[0], denom, output_G);
+}
+//! [calcWnrFilter]
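Editor's note: in formulas, the restoration this new sample implements is standard Wiener deconvolution for a circular defocus PSF. Because the disk PSF is real and centro-symmetric, its spectrum H is real, so the code's H/(|H|^2 + nsr) matches the textbook conjugate form. Using the tutorial's names (nsr = 1/SNR):

    % Defocus PSF (calcPSF): a disk of radius R, normalized to unit sum
    h(x,y) = \begin{cases} 1/S, & (x-x_0)^2 + (y-y_0)^2 \le R^2 \\ 0, & \text{otherwise} \end{cases}

    % Wiener restoration filter (calcWnrFilter)
    H_w(u,v) = \frac{H^*(u,v)}{|H(u,v)|^2 + \frac{1}{\mathrm{SNR}}}

    % Restored spectrum (filter2DFreq): elementwise product, then inverse DFT
    \hat{F}(u,v) = H_w(u,v)\, G(u,v)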

@@ -190,7 +190,7 @@ while cv.waitKey(1) < 0:
     net.setInput(blob)
     if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
         frame = cv.resize(frame, (inpWidth, inpHeight))
-        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info')
+        net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')
     outs = net.forward(getOutputsNames(net))
 
     postprocess(frame, outs)
