Merge remote-tracking branch 'upstream/3.4' into merge-3.4

7 years ago · 4d7d630e92
parent 4b2d1aaeea e82af627ed
commit 4d7d630e92
93 changed files with 4162 additions and 4606 deletions
--- a/3rdparty/carotene/hal/tegra_hal.hpp
+++ b/3rdparty/carotene/hal/tegra_hal.hpp
@ -1433,8 +1433,7 @@ inline int TEGRA_MORPHFREE(cvhalFilter2D *context)

 #define TEGRA_RESIZE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation) \
 ( \
-    /*bilinear interpolation disabled due to rounding accuracy issues*/ \
-    /*interpolation == CV_HAL_INTER_LINEAR ? \
+    interpolation == CV_HAL_INTER_LINEAR ? \
        CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeLinearOpenCVSupported(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), ((src_type >> CV_CN_SHIFT) + 1)) && \
        inv_scale_x > 0 && inv_scale_y > 0 && \
        (dst_width - 0.5)/inv_scale_x - 0.5 < src_width && (dst_height - 0.5)/inv_scale_y - 0.5 < src_height && \
@ -1442,7 +1441,7 @@ inline int TEGRA_MORPHFREE(cvhalFilter2D *context)
        std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \
            CAROTENE_NS::resizeLinearOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
                                            src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \
-            CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \
+            CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
    interpolation == CV_HAL_INTER_AREA ? \
        CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeAreaSupported(1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)) && \
        std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \
--- a/3rdparty/ittnotify/CMakeLists.txt
+++ b/3rdparty/ittnotify/CMakeLists.txt
@ -15,6 +15,8 @@ if(NOT WIN32)
  endif()
 endif()

+ocv_warnings_disable(CMAKE_C_FLAGS -Wimplicit-fallthrough)
+
 ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
 set(ITT_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")

--- a/3rdparty/libjasper/CMakeLists.txt
+++ b/3rdparty/libjasper/CMakeLists.txt
@ -27,6 +27,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-unini
                                   -Wno-unused-but-set-parameter -Wmissing-declarations -Wunused -Wshadow
                                   -Wsign-compare -Wstrict-overflow -Wpointer-compare
                                   -Wabsolute-value  # clang on Linux
+                                   -Wimplicit-fallthrough
 )
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wstrict-prototypes) # clang
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005
--- a/3rdparty/libjpeg/CMakeLists.txt
+++ b/3rdparty/libjpeg/CMakeLists.txt
@ -32,7 +32,7 @@ if(CV_GCC OR CV_CLANG)
  set_source_files_properties(jcdctmgr.c PROPERTIES COMPILE_FLAGS "-O1")
 endif()

-ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wshadow -Wunused -Wshift-negative-value)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wshadow -Wunused -Wshift-negative-value -Wimplicit-fallthrough)
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4244 /wd4267) # vs2005

--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@ -63,7 +63,7 @@ endif(MSVC)
 add_library(${PNG_LIBRARY} STATIC ${lib_srcs} ${lib_hdrs})
 target_link_libraries(${PNG_LIBRARY} ${ZLIB_LIBRARIES})

-ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wimplicit-fallthrough)

 set_target_properties(${PNG_LIBRARY}
  PROPERTIES OUTPUT_NAME ${PNG_LIBRARY}
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@ -438,6 +438,7 @@ endif()
 ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-prototypes -Wmissing-declarations -Wundef -Wunused -Wsign-compare
                                   -Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast
                                   -Wmisleading-indentation
+                                   -Wimplicit-fallthrough
 )
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter
--- a/3rdparty/libwebp/CMakeLists.txt
+++ b/3rdparty/libwebp/CMakeLists.txt
@ -37,6 +37,7 @@ endif()
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wunused-function -Wshadow -Wmaybe-uninitialized
    -Wmissing-prototypes  # clang
    -Wmissing-declarations # gcc
+    -Wimplicit-fallthrough
 )
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4244 /wd4267) # vs2005

--- a/3rdparty/openexr/CMakeLists.txt
+++ b/3rdparty/openexr/CMakeLists.txt
@ -45,6 +45,7 @@ source_group("Src" FILES ${lib_srcs})
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -Wmissing-declarations -Wuninitialized -Wswitch -Wparentheses -Warray-bounds -Wextra
                                     -Wdeprecated-declarations -Wmisleading-indentation -Wdeprecated
                                     -Wsuggest-override -Winconsistent-missing-override
+                                     -Wimplicit-fallthrough
 )
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4018 /wd4099 /wd4100 /wd4101 /wd4127 /wd4189 /wd4245 /wd4305 /wd4389 /wd4512 /wd4701 /wd4702 /wd4706 /wd4800) # vs2005
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4334) # vs2005 Win64
--- a/3rdparty/protobuf/CMakeLists.txt
+++ b/3rdparty/protobuf/CMakeLists.txt
@ -21,6 +21,7 @@ else()
                                       -Wunused-function -Wunused-const-variable -Wshorten-64-to-32 -Wno-invalid-offsetof
                                       -Wenum-compare-switch
                                       -Wsuggest-override -Winconsistent-missing-override
+                                       -Wimplicit-fallthrough
  )
 endif()
 if(CV_ICC)
--- a/3rdparty/zlib/CMakeLists.txt
+++ b/3rdparty/zlib/CMakeLists.txt
@ -82,6 +82,7 @@ set_target_properties(${ZLIB_LIBRARY} PROPERTIES DEFINE_SYMBOL ZLIB_DLL)
 ocv_warnings_disable(CMAKE_C_FLAGS -Wshorten-64-to-32 -Wattributes -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wshift-negative-value
    -Wundef  # _LFS64_LARGEFILE is not defined
    /wd4267  # MSVS 2015 (x64) + zlib 1.2.11
+    -Wimplicit-fallthrough
 )

 set_target_properties(${ZLIB_LIBRARY} PROPERTIES
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -392,7 +392,7 @@ endforeach()
 if(_add_native_flag)
  set(_varname "HAVE_CPU_NATIVE_SUPPORT")
  ocv_check_compiler_flag(CXX "-march=native" "${_varname}" "")
-  if(_varname)
+  if(${_varname})
    set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} -march=native")
  else()
    set(_varname "HAVE_CPU_HOST_SUPPORT")
@ -402,8 +402,8 @@ if(_add_native_flag)
      set(_flag "-xHost")
    endif()
    ocv_check_compiler_flag(CXX "${_flag}" "${_varname}" "")
-    if(_varname)
-      set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} ${flag}")
+    if(${_varname})
+      set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} ${_flag}")
    endif()
  endif()
 endif()
@ -703,16 +703,19 @@ macro(ocv_compiler_optimization_fill_cpu_config)
      set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
 #  define CV_TRY_${OPT} 1
+#  define CV_CPU_FORCE_${OPT} 1
 #  define CV_CPU_HAS_SUPPORT_${OPT} 1
 #  define CV_CPU_CALL_${OPT}(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_${OPT}_(fn, args) return (opt_${OPT}::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
 #  define CV_TRY_${OPT} 1
+#  define CV_CPU_FORCE_${OPT} 0
 #  define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
 #  define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args)
 #  define CV_CPU_CALL_${OPT}_(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args)
 #else
 #  define CV_TRY_${OPT} 0
+#  define CV_CPU_FORCE_${OPT} 0
 #  define CV_CPU_HAS_SUPPORT_${OPT} 0
 #  define CV_CPU_CALL_${OPT}(fn, args)
 #  define CV_CPU_CALL_${OPT}_(fn, args)
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -119,7 +119,10 @@ if(CV_GCC OR CV_CLANG)
    add_extra_compiler_option(-Wno-delete-non-virtual-dtor)
    add_extra_compiler_option(-Wno-unnamed-type-template-args)
    add_extra_compiler_option(-Wno-comment)
-    add_extra_compiler_option(-Wno-implicit-fallthrough)
+    # Turn on -Wimplicit-fallthrough=3 unless the user opted out through
+    # OPENCV_SKIP_IMPLICIT_FALLTHROUGH, or one of CMAKE_CXX_FLAGS / OPENCV_EXTRA_FLAGS /
+    # OPENCV_EXTRA_CXX_FLAGS already mentions "implicit-fallthrough" (plain substring
+    # match, so both -W... and -Wno-... spellings suppress the default).
+    if(NOT OPENCV_SKIP_IMPLICIT_FALLTHROUGH
+        AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES "implicit-fallthrough")
+      add_extra_compiler_option(-Wimplicit-fallthrough=3)
+    endif()
    if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 7.2.0)
      add_extra_compiler_option(-Wno-strict-overflow) # Issue is fixed in GCC 7.2.1
    endif()
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@ -15,6 +15,7 @@ macro(ie_fail)
    return()
 endmacro()

+
 if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
    set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
    if(DEFINED ENV{INTEL_CVSDK_DIR})
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -504,7 +504,7 @@ macro(ocv_warnings_disable)
      foreach(var ${_flag_vars})
        foreach(warning ${_gxx_warnings})
          if(NOT warning MATCHES "^-Wno-")
-            string(REPLACE "${warning}" "" ${var} "${${var}}")
+            string(REGEX REPLACE "${warning}(=[^ ]*)?" "" ${var} "${${var}}")
            string(REPLACE "-W" "-Wno-" warning "${warning}")
          endif()
          ocv_check_flag_support(${var} "${warning}" _varname "")
--- a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
+++ b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
@ -43,7 +43,7 @@ endif()

 if(NOT DEFINED OpenCV_STATIC)
  # look for global setting
-  if(BUILD_SHARED_LIBS)
+  if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
    set(OpenCV_STATIC OFF)
  else()
    set(OpenCV_STATIC ON)
@ -57,6 +57,33 @@ if(NOT DEFINED OpenCV_CUDA)
  endif()
 endif()

+# check_one_config(<RES>)
+#
+# Looks for an OpenCVConfig.cmake below OpenCV_CONFIG_PATH for the currently selected
+# OpenCV_ARCH / OpenCV_RUNTIME and stores the directory that contains it in <RES>
+# (in the caller's scope). <RES> is set to an empty string when OpenCV_RUNTIME or
+# OpenCV_ARCH is unset/false, or when none of the candidate directories has the file.
+#
+# Candidate directories are probed in this order; the first hit wins:
+#   1. <arch>/<runtime>/staticlib      (only if OpenCV_STATIC)
+#   2. gpu/<arch>/<runtime>/lib        (only if OpenCV_CUDA)
+#   3. gpu/<arch>/<runtime>/staticlib  (only if OpenCV_CUDA AND OpenCV_STATIC)
+#   4. <arch>/<runtime>/lib            (always)
+#
+# Reads OpenCV_CONFIG_PATH at call time, so it must be set before the first call.
+# NOTE(review): with both OpenCV_STATIC and OpenCV_CUDA set, the non-GPU staticlib
+# directory (1) is tried before the GPU staticlib directory (3) - confirm this
+# priority is intended.
+function(check_one_config RES)
+  # Default result: "not found".
+  set(${RES} "" PARENT_SCOPE)
+  if(NOT OpenCV_RUNTIME OR NOT OpenCV_ARCH)
+    return()
+  endif()
+  # Build the ordered list of relative directories to probe.
+  set(candidates)
+  if(OpenCV_STATIC)
+    list(APPEND candidates "${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
+  endif()
+  if(OpenCV_CUDA)
+    list(APPEND candidates "gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
+  endif()
+  if(OpenCV_CUDA AND OpenCV_STATIC)
+    list(APPEND candidates "gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
+  endif()
+  list(APPEND candidates "${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
+  # Return the first candidate that actually ships an OpenCVConfig.cmake.
+  foreach(c ${candidates})
+    set(p "${OpenCV_CONFIG_PATH}/${c}")
+    if(EXISTS "${p}/OpenCVConfig.cmake")
+      set(${RES} "${p}" PARENT_SCOPE)
+      return()
+    endif()
+  endforeach()
+endfunction()
+
+get_filename_component(OpenCV_CONFIG_PATH "${CMAKE_CURRENT_LIST_FILE}" DIRECTORY)
+
 if(DEFINED OpenCV_ARCH AND DEFINED OpenCV_RUNTIME)
  # custom overridden values
 elseif(MSVC)
@ -82,6 +109,10 @@ elseif(MSVC)
    set(OpenCV_RUNTIME vc14)
  elseif(MSVC_VERSION MATCHES "^191[0-9]$")
    set(OpenCV_RUNTIME vc15)
+    check_one_config(has_VS2017)
+    if(NOT has_VS2017)
+      set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
+    endif()
  endif()
 elseif(MINGW)
  set(OpenCV_RUNTIME mingw)
@ -97,29 +128,14 @@ elseif(MINGW)
  endif()
 endif()

+check_one_config(OpenCV_LIB_PATH)
+
 if(NOT OpenCV_FIND_QUIETLY)
  message(STATUS "OpenCV ARCH: ${OpenCV_ARCH}")
  message(STATUS "OpenCV RUNTIME: ${OpenCV_RUNTIME}")
  message(STATUS "OpenCV STATIC: ${OpenCV_STATIC}")
 endif()

-get_filename_component(OpenCV_CONFIG_PATH "${CMAKE_CURRENT_LIST_FILE}" PATH)
-if(OpenCV_RUNTIME AND OpenCV_ARCH)
-  if(OpenCV_STATIC AND EXISTS "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib/OpenCVConfig.cmake")
-    if(OpenCV_CUDA AND EXISTS "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib/OpenCVConfig.cmake")
-      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
-    else()
-      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
-    endif()
-  elseif(EXISTS "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib/OpenCVConfig.cmake")
-    if(OpenCV_CUDA AND EXISTS "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib/OpenCVConfig.cmake")
-      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
-    else()
-      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
-    endif()
-  endif()
-endif()
-
 if(OpenCV_LIB_PATH AND EXISTS "${OpenCV_LIB_PATH}/OpenCVConfig.cmake")
  include("${OpenCV_LIB_PATH}/OpenCVConfig.cmake")

--- a/modules/core/include/opencv2/core/affine.hpp
+++ b/modules/core/include/opencv2/core/affine.hpp
@ -445,7 +445,7 @@ void cv::Affine3<T>::rotation(const cv::Mat& data)
        rotation(_rvec);
    }
    else
-        CV_Assert(!"Input matrix can only be 3x3, 1x3 or 3x1");
+        CV_Error(Error::StsError, "Input matrix can only be 3x3, 1x3 or 3x1");
 }

 template<typename T> inline
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -454,6 +454,17 @@ for example:

 #define CV_Assert_1( expr ) if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ )

+//! @cond IGNORED
+// Only when __OPENCV_BUILD is defined: re-point CV_Error and CV_Error_ at the
+// CV_ErrorNoReturn / CV_ErrorNoReturn_ macros, and redefine CV_Assert_1 so that its
+// failure branch calls cv::errorNoReturn() instead of cv::error().
+// NOTE(review): presumably this lets the compiler treat the failure path as
+// non-returning (e.g. for fallthrough / missing-return diagnostics) - confirm against
+// the declaration of cv::errorNoReturn.
+#ifdef __OPENCV_BUILD
+#undef CV_Error
+#define CV_Error CV_ErrorNoReturn
+#undef CV_Error_
+#define CV_Error_ CV_ErrorNoReturn_
+#undef CV_Assert_1
+#define CV_Assert_1( expr ) if(!!(expr)) ; else cv::errorNoReturn( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ )
+#endif
+//! @endcond
+
 #endif // CV_STATIC_ANALYSIS

 #define CV_Assert_2( expr1, expr2 ) CV_Assert_1(expr1); CV_Assert_1(expr2)
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -2,16 +2,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
 #  define CV_TRY_SSE 1
+#  define CV_CPU_FORCE_SSE 1
 #  define CV_CPU_HAS_SUPPORT_SSE 1
 #  define CV_CPU_CALL_SSE(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSE_(fn, args) return (opt_SSE::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
 #  define CV_TRY_SSE 1
+#  define CV_CPU_FORCE_SSE 0
 #  define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
 #  define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
 #  define CV_CPU_CALL_SSE_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
 #else
 #  define CV_TRY_SSE 0
+#  define CV_CPU_FORCE_SSE 0
 #  define CV_CPU_HAS_SUPPORT_SSE 0
 #  define CV_CPU_CALL_SSE(fn, args)
 #  define CV_CPU_CALL_SSE_(fn, args)
@ -20,16 +23,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
 #  define CV_TRY_SSE2 1
+#  define CV_CPU_FORCE_SSE2 1
 #  define CV_CPU_HAS_SUPPORT_SSE2 1
 #  define CV_CPU_CALL_SSE2(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSE2_(fn, args) return (opt_SSE2::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
 #  define CV_TRY_SSE2 1
+#  define CV_CPU_FORCE_SSE2 0
 #  define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
 #  define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
 #  define CV_CPU_CALL_SSE2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
 #else
 #  define CV_TRY_SSE2 0
+#  define CV_CPU_FORCE_SSE2 0
 #  define CV_CPU_HAS_SUPPORT_SSE2 0
 #  define CV_CPU_CALL_SSE2(fn, args)
 #  define CV_CPU_CALL_SSE2_(fn, args)
@ -38,16 +44,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
 #  define CV_TRY_SSE3 1
+#  define CV_CPU_FORCE_SSE3 1
 #  define CV_CPU_HAS_SUPPORT_SSE3 1
 #  define CV_CPU_CALL_SSE3(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSE3_(fn, args) return (opt_SSE3::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
 #  define CV_TRY_SSE3 1
+#  define CV_CPU_FORCE_SSE3 0
 #  define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
 #  define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
 #  define CV_CPU_CALL_SSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
 #else
 #  define CV_TRY_SSE3 0
+#  define CV_CPU_FORCE_SSE3 0
 #  define CV_CPU_HAS_SUPPORT_SSE3 0
 #  define CV_CPU_CALL_SSE3(fn, args)
 #  define CV_CPU_CALL_SSE3_(fn, args)
@ -56,16 +65,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
 #  define CV_TRY_SSSE3 1
+#  define CV_CPU_FORCE_SSSE3 1
 #  define CV_CPU_HAS_SUPPORT_SSSE3 1
 #  define CV_CPU_CALL_SSSE3(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSSE3_(fn, args) return (opt_SSSE3::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
 #  define CV_TRY_SSSE3 1
+#  define CV_CPU_FORCE_SSSE3 0
 #  define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
 #  define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
 #  define CV_CPU_CALL_SSSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
 #else
 #  define CV_TRY_SSSE3 0
+#  define CV_CPU_FORCE_SSSE3 0
 #  define CV_CPU_HAS_SUPPORT_SSSE3 0
 #  define CV_CPU_CALL_SSSE3(fn, args)
 #  define CV_CPU_CALL_SSSE3_(fn, args)
@ -74,16 +86,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
 #  define CV_TRY_SSE4_1 1
+#  define CV_CPU_FORCE_SSE4_1 1
 #  define CV_CPU_HAS_SUPPORT_SSE4_1 1
 #  define CV_CPU_CALL_SSE4_1(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSE4_1_(fn, args) return (opt_SSE4_1::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
 #  define CV_TRY_SSE4_1 1
+#  define CV_CPU_FORCE_SSE4_1 0
 #  define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
 #  define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
 #  define CV_CPU_CALL_SSE4_1_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
 #else
 #  define CV_TRY_SSE4_1 0
+#  define CV_CPU_FORCE_SSE4_1 0
 #  define CV_CPU_HAS_SUPPORT_SSE4_1 0
 #  define CV_CPU_CALL_SSE4_1(fn, args)
 #  define CV_CPU_CALL_SSE4_1_(fn, args)
@ -92,16 +107,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
 #  define CV_TRY_SSE4_2 1
+#  define CV_CPU_FORCE_SSE4_2 1
 #  define CV_CPU_HAS_SUPPORT_SSE4_2 1
 #  define CV_CPU_CALL_SSE4_2(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_SSE4_2_(fn, args) return (opt_SSE4_2::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
 #  define CV_TRY_SSE4_2 1
+#  define CV_CPU_FORCE_SSE4_2 0
 #  define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
 #  define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
 #  define CV_CPU_CALL_SSE4_2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
 #else
 #  define CV_TRY_SSE4_2 0
+#  define CV_CPU_FORCE_SSE4_2 0
 #  define CV_CPU_HAS_SUPPORT_SSE4_2 0
 #  define CV_CPU_CALL_SSE4_2(fn, args)
 #  define CV_CPU_CALL_SSE4_2_(fn, args)
@ -110,16 +128,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
 #  define CV_TRY_POPCNT 1
+#  define CV_CPU_FORCE_POPCNT 1
 #  define CV_CPU_HAS_SUPPORT_POPCNT 1
 #  define CV_CPU_CALL_POPCNT(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_POPCNT_(fn, args) return (opt_POPCNT::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
 #  define CV_TRY_POPCNT 1
+#  define CV_CPU_FORCE_POPCNT 0
 #  define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
 #  define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
 #  define CV_CPU_CALL_POPCNT_(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
 #else
 #  define CV_TRY_POPCNT 0
+#  define CV_CPU_FORCE_POPCNT 0
 #  define CV_CPU_HAS_SUPPORT_POPCNT 0
 #  define CV_CPU_CALL_POPCNT(fn, args)
 #  define CV_CPU_CALL_POPCNT_(fn, args)
@ -128,16 +149,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
 #  define CV_TRY_AVX 1
+#  define CV_CPU_FORCE_AVX 1
 #  define CV_CPU_HAS_SUPPORT_AVX 1
 #  define CV_CPU_CALL_AVX(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_AVX_(fn, args) return (opt_AVX::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
 #  define CV_TRY_AVX 1
+#  define CV_CPU_FORCE_AVX 0
 #  define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
 #  define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
 #  define CV_CPU_CALL_AVX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
 #else
 #  define CV_TRY_AVX 0
+#  define CV_CPU_FORCE_AVX 0
 #  define CV_CPU_HAS_SUPPORT_AVX 0
 #  define CV_CPU_CALL_AVX(fn, args)
 #  define CV_CPU_CALL_AVX_(fn, args)
@ -146,16 +170,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
 #  define CV_TRY_FP16 1
+#  define CV_CPU_FORCE_FP16 1
 #  define CV_CPU_HAS_SUPPORT_FP16 1
 #  define CV_CPU_CALL_FP16(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_FP16_(fn, args) return (opt_FP16::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
 #  define CV_TRY_FP16 1
+#  define CV_CPU_FORCE_FP16 0
 #  define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
 #  define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
 #  define CV_CPU_CALL_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
 #else
 #  define CV_TRY_FP16 0
+#  define CV_CPU_FORCE_FP16 0
 #  define CV_CPU_HAS_SUPPORT_FP16 0
 #  define CV_CPU_CALL_FP16(fn, args)
 #  define CV_CPU_CALL_FP16_(fn, args)
@ -164,16 +191,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
 #  define CV_TRY_AVX2 1
+#  define CV_CPU_FORCE_AVX2 1
 #  define CV_CPU_HAS_SUPPORT_AVX2 1
 #  define CV_CPU_CALL_AVX2(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_AVX2_(fn, args) return (opt_AVX2::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
 #  define CV_TRY_AVX2 1
+#  define CV_CPU_FORCE_AVX2 0
 #  define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
 #  define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
 #  define CV_CPU_CALL_AVX2_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
 #else
 #  define CV_TRY_AVX2 0
+#  define CV_CPU_FORCE_AVX2 0
 #  define CV_CPU_HAS_SUPPORT_AVX2 0
 #  define CV_CPU_CALL_AVX2(fn, args)
 #  define CV_CPU_CALL_AVX2_(fn, args)
@ -182,16 +212,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
 #  define CV_TRY_FMA3 1
+#  define CV_CPU_FORCE_FMA3 1
 #  define CV_CPU_HAS_SUPPORT_FMA3 1
 #  define CV_CPU_CALL_FMA3(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_FMA3_(fn, args) return (opt_FMA3::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
 #  define CV_TRY_FMA3 1
+#  define CV_CPU_FORCE_FMA3 0
 #  define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
 #  define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
 #  define CV_CPU_CALL_FMA3_(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
 #else
 #  define CV_TRY_FMA3 0
+#  define CV_CPU_FORCE_FMA3 0
 #  define CV_CPU_HAS_SUPPORT_FMA3 0
 #  define CV_CPU_CALL_FMA3(fn, args)
 #  define CV_CPU_CALL_FMA3_(fn, args)
@ -200,16 +233,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX_512F
 #  define CV_TRY_AVX_512F 1
+#  define CV_CPU_FORCE_AVX_512F 1
 #  define CV_CPU_HAS_SUPPORT_AVX_512F 1
 #  define CV_CPU_CALL_AVX_512F(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_AVX_512F_(fn, args) return (opt_AVX_512F::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX_512F
 #  define CV_TRY_AVX_512F 1
+#  define CV_CPU_FORCE_AVX_512F 0
 #  define CV_CPU_HAS_SUPPORT_AVX_512F (cv::checkHardwareSupport(CV_CPU_AVX_512F))
 #  define CV_CPU_CALL_AVX_512F(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
 #  define CV_CPU_CALL_AVX_512F_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
 #else
 #  define CV_TRY_AVX_512F 0
+#  define CV_CPU_FORCE_AVX_512F 0
 #  define CV_CPU_HAS_SUPPORT_AVX_512F 0
 #  define CV_CPU_CALL_AVX_512F(fn, args)
 #  define CV_CPU_CALL_AVX_512F_(fn, args)
@ -218,16 +254,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX
 #  define CV_TRY_AVX512_SKX 1
+#  define CV_CPU_FORCE_AVX512_SKX 1
 #  define CV_CPU_HAS_SUPPORT_AVX512_SKX 1
 #  define CV_CPU_CALL_AVX512_SKX(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_AVX512_SKX_(fn, args) return (opt_AVX512_SKX::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX
 #  define CV_TRY_AVX512_SKX 1
+#  define CV_CPU_FORCE_AVX512_SKX 0
 #  define CV_CPU_HAS_SUPPORT_AVX512_SKX (cv::checkHardwareSupport(CV_CPU_AVX512_SKX))
 #  define CV_CPU_CALL_AVX512_SKX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
 #  define CV_CPU_CALL_AVX512_SKX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
 #else
 #  define CV_TRY_AVX512_SKX 0
+#  define CV_CPU_FORCE_AVX512_SKX 0
 #  define CV_CPU_HAS_SUPPORT_AVX512_SKX 0
 #  define CV_CPU_CALL_AVX512_SKX(fn, args)
 #  define CV_CPU_CALL_AVX512_SKX_(fn, args)
@ -236,16 +275,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
 #  define CV_TRY_NEON 1
+#  define CV_CPU_FORCE_NEON 1
 #  define CV_CPU_HAS_SUPPORT_NEON 1
 #  define CV_CPU_CALL_NEON(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_NEON_(fn, args) return (opt_NEON::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
 #  define CV_TRY_NEON 1
+#  define CV_CPU_FORCE_NEON 0
 #  define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
 #  define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
 #  define CV_CPU_CALL_NEON_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
 #else
 #  define CV_TRY_NEON 0
+#  define CV_CPU_FORCE_NEON 0
 #  define CV_CPU_HAS_SUPPORT_NEON 0
 #  define CV_CPU_CALL_NEON(fn, args)
 #  define CV_CPU_CALL_NEON_(fn, args)
@ -254,16 +296,19 @@

 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
 #  define CV_TRY_VSX 1
+#  define CV_CPU_FORCE_VSX 1
 #  define CV_CPU_HAS_SUPPORT_VSX 1
 #  define CV_CPU_CALL_VSX(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_VSX_(fn, args) return (opt_VSX::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX
 #  define CV_TRY_VSX 1
+#  define CV_CPU_FORCE_VSX 0
 #  define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
 #  define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
 #  define CV_CPU_CALL_VSX_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
 #else
 #  define CV_TRY_VSX 0
+#  define CV_CPU_FORCE_VSX 0
 #  define CV_CPU_HAS_SUPPORT_VSX 0
 #  define CV_CPU_CALL_VSX(fn, args)
 #  define CV_CPU_CALL_VSX_(fn, args)
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -202,6 +202,7 @@ Regular integers:
 |pack_u             | x |   | x |   |   |   |
 |unpack             | x | x | x | x | x | x |
 |extract            | x | x | x | x | x | x |
+|rotate (lanes)     | x | x | x | x | x | x |
 |cvt_flt32          |   |   |   |   |   | x |
 |cvt_flt64          |   |   |   |   |   | x |
 |transpose4x4       |   |   |   |   | x | x |
@ -215,6 +216,7 @@ Big integers:
 |shift              | x | x |
 |logical            | x | x |
 |extract            | x | x |
+|rotate (lanes)     | x | x |

 Floating point:

@ -236,7 +238,8 @@ Floating point:
 |sqrt, abs          | x | x |
 |float math         | x | x |
 |transpose4x4       | x |   |
-
+|extract            | x | x |
+|rotate (lanes)     | x | x |

 @{ */

@ -1499,7 +1502,7 @@ Usage:
 v_int32x4 a, b, c;
 c = v_extract<2>(a, b);
@endcode
-For integer types only. */
+For all types. */
 template<int s, typename _Tp, int n>
 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -770,15 +770,7 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
-{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
-template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
-{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
-template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
-{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
-template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
-template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
+{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
@ -789,6 +781,29 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)

+// Lane rotation for NEON vectors, built on vextq_<suffix>(lo, hi, k), which
+// extracts one vector starting at lane k of the concatenation lo:hi.
+//  - one-argument forms shift in zero lanes (vdupq_n_<suffix>(0));
+//  - two-argument forms shift in lanes taken from b.
+// vextq_* only accepts an immediate in [0, nlanes - 1]. The generic
+// v_rotate_left passes (nlanes - n), which is out of range for n == 0, so
+// that case is specialized to return a unchanged.
+#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec&) \
+{ return a; }
+
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_load(const _Tp* ptr) \
 { return _Tpvec(vld1q_##suffix(ptr)); } \
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -61,6 +61,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 struct v_uint8x16
 {
    typedef uchar lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_uint8x16() : val(_mm_setzero_si128()) {}
@ -84,6 +85,7 @@ struct v_uint8x16
 struct v_int8x16
 {
    typedef schar lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_int8x16() : val(_mm_setzero_si128()) {}
@ -107,6 +109,7 @@ struct v_int8x16
 struct v_uint16x8
 {
    typedef ushort lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_uint16x8() : val(_mm_setzero_si128()) {}
@ -127,6 +130,7 @@ struct v_uint16x8
 struct v_int16x8
 {
    typedef short lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_int16x8() : val(_mm_setzero_si128()) {}
@ -146,6 +150,7 @@ struct v_int16x8
 struct v_uint32x4
 {
    typedef unsigned lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_uint32x4() : val(_mm_setzero_si128()) {}
@ -164,6 +169,7 @@ struct v_uint32x4
 struct v_int32x4
 {
    typedef int lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_int32x4() : val(_mm_setzero_si128()) {}
@ -182,6 +188,7 @@ struct v_int32x4
 struct v_float32x4
 {
    typedef float lane_type;
+    typedef __m128 vector_type;
    enum { nlanes = 4 };

    v_float32x4() : val(_mm_setzero_ps()) {}
@ -200,6 +207,7 @@ struct v_float32x4
 struct v_uint64x2
 {
    typedef uint64 lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_uint64x2() : val(_mm_setzero_si128()) {}
@ -220,6 +228,7 @@ struct v_uint64x2
 struct v_int64x2
 {
    typedef int64 lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_int64x2() : val(_mm_setzero_si128()) {}
@ -240,6 +249,7 @@ struct v_int64x2
 struct v_float64x2
 {
    typedef double lane_type;
+    typedef __m128d vector_type;
    enum { nlanes = 2 };

    v_float64x2() : val(_mm_setzero_pd()) {}
@ -259,6 +269,7 @@ struct v_float64x2
 struct v_float16x4
 {
    typedef short lane_type;
+    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_float16x4() : val(_mm_setzero_si128()) {}
@ -275,6 +286,27 @@ struct v_float16x4
 };
 #endif

+// Raw-register reinterpretation helpers: one template that converts between
+// the three SSE register types (__m128i, __m128, __m128d), so generic code
+// can select the cast from the source and destination types.
+namespace hal_sse_internal
+{
+    // Primary template is only declared; every supported (to, from) pair is
+    // supplied as an explicit specialization by the macro below.
+    template <typename to_sse_type, typename from_sse_type>
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
+
+// Defines the specialization for one (to, from) pair by applying
+// sse_cast_intrin to the argument. The expansion is a complete function
+// definition, so invocations take no trailing ';' (a stray ';' at namespace
+// scope is an empty declaration that pedantic pre-C++11 builds diagnose).
+#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
+    template<> inline \
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
+    { return sse_cast_intrin(a); }
+
+    // Same-type pairs use OPENCV_HAL_NOP (defined elsewhere; presumably an
+    // identity macro - confirm), the others use the matching _mm_cast* intrinsic.
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
+}
+
 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
@ -796,43 +828,75 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)

+// Per-lane minimum of signed 8-bit lanes.
+// With SSE4.1 this is a single _mm_min_epi8. Otherwise both inputs have
+// their sign bit flipped (xor with 0x80) so that signed order maps onto
+// unsigned order, _mm_min_epu8 is applied, and the flip is undone.
 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
 {
+#if CV_SSE4_1
+    return v_int8x16(_mm_min_epi8(a.val, b.val));
+#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
+#endif
 }
+// Per-lane maximum of signed 8-bit lanes.
+// With SSE4.1 this is a single _mm_max_epi8. Otherwise the sign bit of both
+// inputs is flipped (xor with 0x80), the unsigned _mm_max_epu8 is applied,
+// and the flip is undone on the result.
 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
 {
+#if CV_SSE4_1
+    return v_int8x16(_mm_max_epi8(a.val, b.val));
+#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
+#endif
 }
+// Per-lane minimum of unsigned 16-bit lanes.
+// With SSE4.1 this is _mm_min_epu16. The fallback computes
+// a - saturating_sub(a, b): the inner term is (a - b) when a > b and 0
+// otherwise, so the result is b when a > b and a otherwise.
 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
 {
+#if CV_SSE4_1
+    return v_uint16x8(_mm_min_epu16(a.val, b.val));
+#else
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
+#endif
 }
+// Per-lane maximum of unsigned 16-bit lanes.
+// With SSE4.1 this is _mm_max_epu16. The fallback computes
+// saturating_sub(a, b) + b: the first term is (a - b) when a > b and 0
+// otherwise, so the sum is a when a > b and b otherwise.
 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
 {
+#if CV_SSE4_1
+    return v_uint16x8(_mm_max_epu16(a.val, b.val));
+#else
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
+#endif
 }
+// Per-lane minimum of unsigned 32-bit lanes.
+// With SSE4.1 this is _mm_min_epu32. The fallback flips the top bit of both
+// inputs (xor with 0x80000000) so the signed _mm_cmpgt_epi32 can be used as
+// an "a > b" test, then picks b or a per lane through v_select_si128
+// (defined elsewhere; presumably takes its second argument where the mask
+// is set - confirm).
 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
 {
+#if CV_SSE4_1
+    return v_uint32x4(_mm_min_epu32(a.val, b.val));
+#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
+#endif
 }
+// Per-lane maximum of unsigned 32-bit lanes.
+// With SSE4.1 this is _mm_max_epu32. The fallback builds the same biased
+// "a > b" mask as the unsigned 32-bit v_min (xor with 0x80000000, then the
+// signed _mm_cmpgt_epi32) and passes a and b to v_select_si128 in the
+// opposite order.
 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
 {
+#if CV_SSE4_1
+    return v_uint32x4(_mm_max_epu32(a.val, b.val));
+#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
+#endif
 }
+// Per-lane minimum of signed 32-bit lanes.
+// With SSE4.1 this is _mm_min_epi32. The fallback uses the signed compare
+// _mm_cmpgt_epi32(a, b) as a mask and selects between b and a with
+// v_select_si128 (defined elsewhere).
 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_SSE4_1
+    return v_int32x4(_mm_min_epi32(a.val, b.val));
+#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
+#endif
 }
+// Per-lane maximum of signed 32-bit lanes.
+// With SSE4.1 this is _mm_max_epi32. The fallback uses the signed compare
+// _mm_cmpgt_epi32(a, b) as a mask and selects between a and b with
+// v_select_si128 (defined elsewhere).
 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_SSE4_1
+    return v_int32x4(_mm_max_epi32(a.val, b.val));
+#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
+#endif
 }

 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
@ -1030,31 +1094,116 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)

+// Byte-granular "align right" of two 128-bit registers, used by the lane
+// rotation functions. Conceptually a supplies the low 16 bytes and b the
+// high 16 bytes of a 32-byte value; the result is the 16 bytes starting at
+// byte offset imm (0 <= imm <= 16).
+namespace hal_sse_internal
+{
+    // Dispatcher keyed on imm. The defaulted bool parameters classify imm so
+    // that one of the partial specializations below is selected.
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_half = (imm == 8),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+    class v_sse_palignr_u8_class;
+
+    // imm outside [0, 16]: declared but never defined, so any use of it
+    // fails to compile.
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
+
+    // imm == 0: the result is a itself.
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i&) const
+        {
+            return a;
+        }
+    };
+
+    // imm == 8: low half of the result is the high 64 bits of a, high half
+    // is the low 64 bits of b; done with two 64-bit unpacks.
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
+        }
+    };
+
+    // imm == 16: the result is b itself.
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i&, const __m128i& b) const
+        {
+            return b;
+        }
+    };
+
+    // Any other imm in (0, 16): one _mm_alignr_epi8 when SSSE3 is available
+    // (note its operand order: high part first), otherwise the OR of a
+    // shifted right by imm bytes and b shifted left by (16 - imm) bytes.
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
+    {
+#if CV_SSSE3
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_alignr_epi8(b, a, imm);
+        }
+#else
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            enum { imm2 = (sizeof(__m128i) - imm) };
+            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
+        }
+#endif
+    };
+
+    // Entry point: checks the range of imm at compile time and forwards to
+    // the specialization selected for it.
+    template <int imm>
+    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
+        return v_sse_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+// Shifts the lanes of a towards lane 0 by imm positions: the register is
+// viewed as __m128i, shifted right by imm * sizeof(lane) bytes with
+// _mm_srli_si128 (which fills the vacated bytes with zeros), and converted
+// back to the vector's own register type, so float/double vectors work too.
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec &a)
 {
-    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT));
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_srli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
 }
+
+// Shifts the lanes of a away from lane 0 by imm positions: the register is
+// viewed as __m128i, shifted left by imm * sizeof(lane) bytes with
+// _mm_slli_si128 (which fills the vacated bytes with zeros), and converted
+// back to the vector's own register type.
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_left(const _Tpvec &a)
 {
-    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT));
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_slli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
 }
+
+// Two-source variant: takes the lanes of a from lane imm upwards and fills
+// the remaining upper lanes with the lowest imm lanes of b. Implemented as
+// a byte-wise align of (a low, b high) by imm * sizeof(lane) bytes on the
+// __m128i view of both registers.
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
 {
-    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
-    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2)));
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        v_sse_palignr_u8<imm2>(
+            v_sse_reinterpret_as<__m128i>(a.val),
+            v_sse_reinterpret_as<__m128i>(b.val))));
 }
+
+// Two-source variant: the lowest imm lanes of the result are the highest
+// imm lanes of b, followed by the lanes of a starting from lane 0.
+// Implemented as a byte-wise align of (b low, a high) by
+// (nlanes - imm) * sizeof(lane) bytes - note the swapped operand order
+// compared with v_rotate_right.
 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
 {
-    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
-    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
-    return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2)));
+    using namespace hal_sse_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        v_sse_palignr_u8<imm2>(
+            v_sse_reinterpret_as<__m128i>(b.val),
+            v_sse_reinterpret_as<__m128i>(a.val))));
 }

 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
@ -1371,12 +1520,7 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
+// Extracts one vector from the pair (a, b) starting at lane s of a.
+// This is exactly the two-source v_rotate_right, so it delegates to it
+// instead of repeating the shift/or sequence (which only accepted integer
+// registers).
 template<int s, typename _Tpvec>
 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
 {
-    const int w = sizeof(typename _Tpvec::lane_type);
-    const int n = _Tpvec::nlanes;
-    __m128i ra, rb;
-    ra = _mm_srli_si128(a.val, s*w);
-    rb = _mm_slli_si128(b.val, (n-s)*w);
-    return _Tpvec(_mm_or_si128(ra, rb));
+    return v_rotate_right<s>(a, b);
 }

 inline v_int32x4 v_round(const v_float32x4& a)
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@ -562,9 +562,10 @@ OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
 OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
 OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
 OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
-
+OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)

 template<int imm, typename _Tpvec>
 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@ -104,13 +104,13 @@ namespace cv { namespace cuda {

 #ifndef HAVE_CUDA

-static inline void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without CUDA support"); }
+// Variant for builds without HAVE_CUDA: reports GpuNotSupported through
+// CV_Error. Declared CV_NORETURN (presumably so callers need no dummy
+// return after the call - confirm against the macro's definition).
+static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without CUDA support"); }

 #else // HAVE_CUDA

 #define nppSafeSetStream(oldStream, newStream) { if(oldStream != newStream) { cudaStreamSynchronize(oldStream); nppSetStream(newStream); } }

-static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }
+// Variant for builds with HAVE_CUDA: reports StsNotImplemented through
+// CV_Error for functionality that is disabled in this build or platform.
+// Declared CV_NORETURN like the non-CUDA variant.
+static inline CV_NORETURN void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }

 namespace cv { namespace cuda
 {
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@ -184,6 +184,8 @@ T* allocSingleton(size_t count) { return static_cast<T*>(fastMalloc(sizeof(T) *
 #define IPP_DISABLE_HAAR                1 // improper integration/results
 #define IPP_DISABLE_HOUGH               1 // improper integration/results

+#define IPP_DISABLE_GAUSSIANBLUR_PARALLEL 1 // not supported (2017u2 / 2017u3)
+
 // Temporary disabled named IPP region. Performance
 #define IPP_DISABLE_PERF_COPYMAKE       1 // performance variations
 #define IPP_DISABLE_PERF_LUT            1 // there are no performance benefits (PR #2653)
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -1699,6 +1699,8 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
 {
    CV_INSTRUMENT_REGION()

+    CV_Assert(! _src.empty());
+
    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
               _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_inRange(_src, _lowerb, _upperb, _dst))
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -1363,7 +1363,7 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
 {
    CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size));

-#if !defined(CV_CPU_COMPILE_FP16)
+#if !CV_CPU_FORCE_FP16
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

@ -1382,7 +1382,7 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
 {
    CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size));

-#if !defined(CV_CPU_COMPILE_FP16)
+#if !CV_CPU_FORCE_FP16
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@ -488,7 +488,6 @@ GpuMat& cv::cuda::GpuMat::setTo(Scalar s, Stream& _stream)
    (void) s;
    (void) _stream;
    throw_no_cuda();
-    return *this;
 }

 GpuMat& cv::cuda::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
@ -497,7 +496,6 @@ GpuMat& cv::cuda::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
    (void) _mask;
    (void) _stream;
    throw_no_cuda();
-    return *this;
 }

 void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
--- a/modules/core/src/cuda_host_mem.cpp
+++ b/modules/core/src/cuda_host_mem.cpp
@ -138,7 +138,6 @@ MatAllocator* cv::cuda::HostMem::getAllocator(AllocType alloc_type)
 #ifndef HAVE_CUDA
    (void) alloc_type;
    throw_no_cuda();
-    return NULL;
 #else
    static std::map<unsigned int, Ptr<MatAllocator> > allocators;

@ -302,7 +301,6 @@ GpuMat cv::cuda::HostMem::createGpuMatHeader() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return GpuMat();
 #else
    CV_Assert( alloc_type == SHARED );

--- a/modules/core/src/cuda_info.cpp
+++ b/modules/core/src/cuda_info.cpp
@ -79,7 +79,6 @@ int cv::cuda::getDevice()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    int device;
    cudaSafeCall( cudaGetDevice(&device) );
@ -101,7 +100,6 @@ bool cv::cuda::deviceSupports(FeatureSet feature_set)
 #ifndef HAVE_CUDA
    (void) feature_set;
    throw_no_cuda();
-    return false;
 #else
    static int versions[] =
    {
@ -231,7 +229,6 @@ bool cv::cuda::TargetArchs::builtWith(cv::cuda::FeatureSet feature_set)
 #ifndef HAVE_CUDA
    (void) feature_set;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.builtWith(feature_set);
 #endif
@ -243,7 +240,6 @@ bool cv::cuda::TargetArchs::hasPtx(int major, int minor)
    (void) major;
    (void) minor;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.hasPtx(major, minor);
 #endif
@ -255,7 +251,6 @@ bool cv::cuda::TargetArchs::hasBin(int major, int minor)
    (void) major;
    (void) minor;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.hasBin(major, minor);
 #endif
@ -267,7 +262,6 @@ bool cv::cuda::TargetArchs::hasEqualOrLessPtx(int major, int minor)
    (void) major;
    (void) minor;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.hasEqualOrLessPtx(major, minor);
 #endif
@ -279,7 +273,6 @@ bool cv::cuda::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
    (void) major;
    (void) minor;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.hasEqualOrGreaterPtx(major, minor);
 #endif
@ -291,7 +284,6 @@ bool cv::cuda::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
    (void) major;
    (void) minor;
    throw_no_cuda();
-    return false;
 #else
    return cudaArch.hasEqualOrGreaterBin(major, minor);
 #endif
@ -350,7 +342,6 @@ const char* cv::cuda::DeviceInfo::name() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return "";
 #else
    return deviceProps().get(device_id_)->name;
 #endif
@ -360,7 +351,6 @@ size_t cv::cuda::DeviceInfo::totalGlobalMem() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->totalGlobalMem;
 #endif
@ -370,7 +360,6 @@ size_t cv::cuda::DeviceInfo::sharedMemPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->sharedMemPerBlock;
 #endif
@ -380,7 +369,6 @@ int cv::cuda::DeviceInfo::regsPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->regsPerBlock;
 #endif
@ -390,7 +378,6 @@ int cv::cuda::DeviceInfo::warpSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->warpSize;
 #endif
@ -400,7 +387,6 @@ size_t cv::cuda::DeviceInfo::memPitch() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->memPitch;
 #endif
@ -410,7 +396,6 @@ int cv::cuda::DeviceInfo::maxThreadsPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxThreadsPerBlock;
 #endif
@ -420,7 +405,6 @@ Vec3i cv::cuda::DeviceInfo::maxThreadsDim() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxThreadsDim);
 #endif
@ -430,7 +414,6 @@ Vec3i cv::cuda::DeviceInfo::maxGridSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxGridSize);
 #endif
@ -440,7 +423,6 @@ int cv::cuda::DeviceInfo::clockRate() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->clockRate;
 #endif
@ -450,7 +432,6 @@ size_t cv::cuda::DeviceInfo::totalConstMem() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->totalConstMem;
 #endif
@ -460,7 +441,6 @@ int cv::cuda::DeviceInfo::majorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->major;
 #endif
@ -470,7 +450,6 @@ int cv::cuda::DeviceInfo::minorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->minor;
 #endif
@ -480,7 +459,6 @@ size_t cv::cuda::DeviceInfo::textureAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->textureAlignment;
 #endif
@ -490,7 +468,6 @@ size_t cv::cuda::DeviceInfo::texturePitchAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->texturePitchAlignment;
 #endif
@ -500,7 +477,6 @@ int cv::cuda::DeviceInfo::multiProcessorCount() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->multiProcessorCount;
 #endif
@ -510,7 +486,6 @@ bool cv::cuda::DeviceInfo::kernelExecTimeoutEnabled() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->kernelExecTimeoutEnabled != 0;
 #endif
@ -520,7 +495,6 @@ bool cv::cuda::DeviceInfo::integrated() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->integrated != 0;
 #endif
@ -530,7 +504,6 @@ bool cv::cuda::DeviceInfo::canMapHostMemory() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->canMapHostMemory != 0;
 #endif
@ -540,7 +513,6 @@ DeviceInfo::ComputeMode cv::cuda::DeviceInfo::computeMode() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return ComputeModeDefault;
 #else
    static const ComputeMode tbl[] =
    {
@ -558,7 +530,6 @@ int cv::cuda::DeviceInfo::maxTexture1D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxTexture1D;
 #endif
@ -568,7 +539,6 @@ int cv::cuda::DeviceInfo::maxTexture1DMipmap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    #if CUDA_VERSION >= 5000
        return deviceProps().get(device_id_)->maxTexture1DMipmap;
@ -583,7 +553,6 @@ int cv::cuda::DeviceInfo::maxTexture1DLinear() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxTexture1DLinear;
 #endif
@ -593,7 +562,6 @@ Vec2i cv::cuda::DeviceInfo::maxTexture2D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxTexture2D);
 #endif
@ -603,7 +571,6 @@ Vec2i cv::cuda::DeviceInfo::maxTexture2DMipmap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    #if CUDA_VERSION >= 5000
        return Vec2i(deviceProps().get(device_id_)->maxTexture2DMipmap);
@ -618,7 +585,6 @@ Vec3i cv::cuda::DeviceInfo::maxTexture2DLinear() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxTexture2DLinear);
 #endif
@ -628,7 +594,6 @@ Vec2i cv::cuda::DeviceInfo::maxTexture2DGather() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxTexture2DGather);
 #endif
@ -638,7 +603,6 @@ Vec3i cv::cuda::DeviceInfo::maxTexture3D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxTexture3D);
 #endif
@ -648,7 +612,6 @@ int cv::cuda::DeviceInfo::maxTextureCubemap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxTextureCubemap;
 #endif
@ -658,7 +621,6 @@ Vec2i cv::cuda::DeviceInfo::maxTexture1DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxTexture1DLayered);
 #endif
@ -668,7 +630,6 @@ Vec3i cv::cuda::DeviceInfo::maxTexture2DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxTexture2DLayered);
 #endif
@ -678,7 +639,6 @@ Vec2i cv::cuda::DeviceInfo::maxTextureCubemapLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxTextureCubemapLayered);
 #endif
@ -688,7 +648,6 @@ int cv::cuda::DeviceInfo::maxSurface1D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxSurface1D;
 #endif
@ -698,7 +657,6 @@ Vec2i cv::cuda::DeviceInfo::maxSurface2D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxSurface2D);
 #endif
@ -708,7 +666,6 @@ Vec3i cv::cuda::DeviceInfo::maxSurface3D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxSurface3D);
 #endif
@ -718,7 +675,6 @@ Vec2i cv::cuda::DeviceInfo::maxSurface1DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxSurface1DLayered);
 #endif
@ -728,7 +684,6 @@ Vec3i cv::cuda::DeviceInfo::maxSurface2DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec3i();
 #else
    return Vec3i(deviceProps().get(device_id_)->maxSurface2DLayered);
 #endif
@ -738,7 +693,6 @@ int cv::cuda::DeviceInfo::maxSurfaceCubemap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxSurfaceCubemap;
 #endif
@ -748,7 +702,6 @@ Vec2i cv::cuda::DeviceInfo::maxSurfaceCubemapLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return Vec2i();
 #else
    return Vec2i(deviceProps().get(device_id_)->maxSurfaceCubemapLayered);
 #endif
@ -758,7 +711,6 @@ size_t cv::cuda::DeviceInfo::surfaceAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->surfaceAlignment;
 #endif
@ -768,7 +720,6 @@ bool cv::cuda::DeviceInfo::concurrentKernels() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->concurrentKernels != 0;
 #endif
@ -778,7 +729,6 @@ bool cv::cuda::DeviceInfo::ECCEnabled() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->ECCEnabled != 0;
 #endif
@ -788,7 +738,6 @@ int cv::cuda::DeviceInfo::pciBusID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->pciBusID;
 #endif
@ -798,7 +747,6 @@ int cv::cuda::DeviceInfo::pciDeviceID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->pciDeviceID;
 #endif
@ -808,7 +756,6 @@ int cv::cuda::DeviceInfo::pciDomainID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->pciDomainID;
 #endif
@ -818,7 +765,6 @@ bool cv::cuda::DeviceInfo::tccDriver() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->tccDriver != 0;
 #endif
@ -828,7 +774,6 @@ int cv::cuda::DeviceInfo::asyncEngineCount() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->asyncEngineCount;
 #endif
@ -838,7 +783,6 @@ bool cv::cuda::DeviceInfo::unifiedAddressing() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    return deviceProps().get(device_id_)->unifiedAddressing != 0;
 #endif
@ -848,7 +792,6 @@ int cv::cuda::DeviceInfo::memoryClockRate() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->memoryClockRate;
 #endif
@ -858,7 +801,6 @@ int cv::cuda::DeviceInfo::memoryBusWidth() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->memoryBusWidth;
 #endif
@ -868,7 +810,6 @@ int cv::cuda::DeviceInfo::l2CacheSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->l2CacheSize;
 #endif
@ -878,7 +819,6 @@ int cv::cuda::DeviceInfo::maxThreadsPerMultiProcessor() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return 0;
 #else
    return deviceProps().get(device_id_)->maxThreadsPerMultiProcessor;
 #endif
@ -906,7 +846,6 @@ bool cv::cuda::DeviceInfo::isCompatible() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    // Check PTX compatibility
    if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@ -450,7 +450,6 @@ bool cv::cuda::Stream::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    cudaError_t err = cudaStreamQuery(impl_->stream);

@ -526,8 +525,6 @@ Stream& cv::cuda::Stream::Null()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    static Stream stream;
-    return stream;
 #else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
@ -716,7 +713,6 @@ GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
    (void) cols;
    (void) type;
    throw_no_cuda();
-    return GpuMat();
 #else
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
@ -806,7 +802,6 @@ bool cv::cuda::Event::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
-    return false;
 #else
    cudaError_t err = cudaEventQuery(impl_->event);

@ -833,7 +828,6 @@ float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
    (void) start;
    (void) end;
    throw_no_cuda();
-    return 0.0f;
 #else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
--- a/modules/core/src/gl_core_3_1.cpp
+++ b/modules/core/src/gl_core_3_1.cpp
@ -146,7 +146,6 @@
+    // Stub lookup that unconditionally reports OpenGlNotSupported through
+    // CV_Error. NOTE(review): the trailing return is dropped, which relies
+    // on CV_Error never returning - confirm against its definition.
    static void* IntGetProcAddress(const char*)
    {
        CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support");
-        return 0;
    }
 #endif

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -4666,7 +4666,9 @@ public:
 #endif
            {
                tempUMatFlags = UMatData::TEMP_UMAT;
-                if (u->origdata == cv::alignPtr(u->origdata, 4)) // There are OpenCL runtime issues for less aligned data
+                if (u->origdata == cv::alignPtr(u->origdata, 4)  // There are OpenCL runtime issues for less aligned data
+                    && !(u->originalUMatData && u->originalUMatData->handle)  // Avoid sharing of host memory between OpenCL buffers
+                )
                {
                    handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
                                            u->size, u->origdata, &retval);
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@ -57,7 +57,7 @@ using namespace cv::cuda;
 namespace
 {
 #ifndef HAVE_OPENGL
-inline static void throw_no_ogl() { CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support"); }
+inline static CV_NORETURN void throw_no_ogl() { CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support"); }  // marked CV_NORETURN: callers in this file no longer add a dummy return after the call
 #elif defined _DEBUG
 inline static bool checkError(const char* file, const int line, const char* func = 0)
 {
@ -82,8 +82,7 @@ inline static bool checkError(const char* file, const int line, const char* func
        default:
            msg = "Unknown error";
        };
-        cvError(CV_OpenGlApiCallError, func, msg, file, line);
-        return false;
+        cv::errorNoReturn(Error::OpenGlApiCallError, func, msg, file, line);
    }
    return true;
 }
@ -697,7 +696,6 @@ cv::ogl::Buffer cv::ogl::Buffer::clone(Target target, bool autoRelease) const
    (void) target;
    (void) autoRelease;
    throw_no_ogl();
-    return cv::ogl::Buffer();
 #else
    ogl::Buffer buf;
    buf.copyFrom(*this, target, autoRelease);
@ -731,7 +729,6 @@ Mat cv::ogl::Buffer::mapHost(Access access)
 #ifndef HAVE_OPENGL
    (void) access;
    throw_no_ogl();
-    return Mat();
 #else
    return Mat(rows_, cols_, type_, impl_->mapHost(access));
 #endif
@ -750,11 +747,9 @@ GpuMat cv::ogl::Buffer::mapDevice()
 {
 #ifndef HAVE_OPENGL
    throw_no_ogl();
-    return GpuMat();
 #else
    #ifndef HAVE_CUDA
        throw_no_cuda();
-        return GpuMat();
    #else
        return GpuMat(rows_, cols_, type_, impl_->mapDevice());
    #endif
@ -779,12 +774,10 @@ cuda::GpuMat cv::ogl::Buffer::mapDevice(cuda::Stream& stream)
 #ifndef HAVE_OPENGL
    (void) stream;
    throw_no_ogl();
-    return GpuMat();
 #else
    #ifndef HAVE_CUDA
        (void) stream;
        throw_no_cuda();
-        return GpuMat();
    #else
        return GpuMat(rows_, cols_, type_, impl_->mapDevice(cuda::StreamAccessor::getStream(stream)));
    #endif
@ -810,7 +803,6 @@ unsigned int cv::ogl::Buffer::bufId() const
 {
 #ifndef HAVE_OPENGL
    throw_no_ogl();
-    return 0;
 #else
    return impl_->bufId();
 #endif
@ -1216,7 +1208,6 @@ unsigned int cv::ogl::Texture2D::texId() const
 {
 #ifndef HAVE_OPENGL
    throw_no_ogl();
-    return 0;
 #else
    return impl_->texId();
 #endif
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@ -148,9 +148,8 @@ CvGenericHash* cvCreateMap( int flags, int header_size, int elem_size, CvMemStor
 void icvParseError( CvFileStorage* fs, const char* func_name,
               const char* err_msg, const char* source_file, int source_line )
 {
-    char buf[1<<10];
-    sprintf( buf, "%s(%d): %s", fs->filename, fs->lineno, err_msg );
-    cvError( CV_StsParseError, func_name, buf, source_file, source_line );
+    cv::String msg = cv::format("%s(%d): %s", fs->filename, fs->lineno, err_msg);  // "<storage file>(<line>): <message>"; replaces the fixed 1 KiB sprintf buffer
+    cv::errorNoReturn(cv::Error::StsParseError, func_name, msg.c_str(), source_file, source_line );  // presumably never returns (the header now declares this function CV_NORETURN) -- TODO confirm
 }

 void icvFSCreateCollection( CvFileStorage* fs, int tag, CvFileNode* collection )
--- a/modules/core/src/persistence.hpp
+++ b/modules/core/src/persistence.hpp
@ -262,7 +262,7 @@ void icvFSCreateCollection( CvFileStorage* fs, int tag, CvFileNode* collection )
 char* icvFSResizeWriteBuffer( CvFileStorage* fs, char* ptr, int len );
 int icvCalcStructSize( const char* dt, int initial_size );
 int icvCalcElemSize( const char* dt, int initial_size );
-void icvParseError( CvFileStorage* fs, const char* func_name, const char* err_msg, const char* source_file, int source_line );
+void CV_NORETURN icvParseError( CvFileStorage* fs, const char* func_name, const char* err_msg, const char* source_file, int source_line );
 char* icvEncodeFormat( int elem_type, char* dt );
 int icvDecodeFormat( const char* dt, int* fmt_pairs, int max_len );
 int icvDecodeSimpleFormat( const char* dt );
--- a/modules/core/src/persistence_base64.cpp
+++ b/modules/core/src/persistence_base64.cpp
@ -84,7 +84,9 @@ size_t base64_encode(uint8_t const * src, uint8_t * dst, size_t off, size_t cnt)
    switch (rst)
    {
    case 1U: *dst_cur++ = base64_padding;
+        /* fallthrough */
    case 2U: *dst_cur++ = base64_padding;
+        /* fallthrough */
    default: *dst_cur   = 0;
        break;
    }
@ -636,7 +638,8 @@ private:
                    pack.func = to_binary<double>;
                    break;
                case 'r':
-                default: { CV_Assert(!"type not support"); break; }
+                default:
+                    CV_Error(cv::Error::StsError, "type is not supported");
                };

                offset = static_cast<size_t>(cvAlign(static_cast<int>(offset), static_cast<int>(size)));
@ -795,7 +798,8 @@ private:
                    pack.func = binary_to<double>;
                    break;
                case 'r':
-                default:  { CV_Assert(!"type not support"); break; }
+                default:
+                    CV_Error(cv::Error::StsError, "type is not supported");
                }; // need a better way for outputting error.

                offset = static_cast<size_t>(cvAlign(static_cast<int>(offset), static_cast<int>(size)));
@ -813,7 +817,8 @@ private:
                case 'f': { pack.cv_type = CV_32F; break; }
                case 'd': { pack.cv_type = CV_64F; break; }
                case 'r':
-                default:  { CV_Assert(!"type is not support"); break; }
+                default:
+                    CV_Error(cv::Error::StsError, "type is not supported");
                } // need a better way for outputting error.

                binary_to_funcs.push_back(pack);
--- a/modules/core/src/softfloat.cpp
+++ b/modules/core/src/softfloat.cpp
@ -1098,6 +1098,7 @@ static float32_t f32_roundToInt( float32_t a, uint_fast8_t roundingMode, bool ex
        switch ( roundingMode ) {
         case round_near_even:
            if ( ! fracF32UI( uiA ) ) break;
+            /* fallthrough */
         case round_near_maxMag:
            if ( exp == 0x7E ) uiZ |= packToF32UI( 0, 0x7F, 0 );
            break;
@ -1805,6 +1806,7 @@ static float64_t f64_roundToInt( float64_t a, uint_fast8_t roundingMode, bool ex
        switch ( roundingMode ) {
         case round_near_even:
            if ( ! fracF64UI( uiA ) ) break;
+            /* fallthrough */
         case round_near_maxMag:
            if ( exp == 0x3FE ) uiZ |= packToF64UI( 0, 0x3FF, 0 );
            break;
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -61,14 +61,36 @@ Mutex& getInitializationMutex()
 // force initialization (single-threaded environment)
 Mutex* __initialization_mutex_initializer = &getInitializationMutex();

+static bool param_dumpErrors = utils::getConfigurationParameterBool("OPENCV_DUMP_ERRORS",  // gates dumpException() in cv::error() when no custom error callback is set
+#if defined(_DEBUG) || defined(__ANDROID__)
+    true  // value passed for debug and Android builds (presumably the default used when the parameter is unset -- TODO confirm)
+#else
+    false  // value passed for every other build
+#endif
+);
+
 } // namespace cv

+#ifndef CV_ERROR_SET_TERMINATE_HANDLER  // build config option
+# if defined(_WIN32)
+#   define CV_ERROR_SET_TERMINATE_HANDLER 1
+# endif
+#endif
+#if defined(CV_ERROR_SET_TERMINATE_HANDLER) && !CV_ERROR_SET_TERMINATE_HANDLER
+# undef CV_ERROR_SET_TERMINATE_HANDLER
+#endif
+
 #ifdef _MSC_VER
 # if _MSC_VER >= 1700
 #  pragma warning(disable:4447) // Disable warning 'main' signature found without threading model
 # endif
 #endif

+#ifdef CV_ERROR_SET_TERMINATE_HANDLER
+#include <exception>      // std::set_terminate
+#include <cstdlib>        // std::abort
+#endif
+
 #if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __HAIKU__
 #  include <unistd.h>
 #  include <fcntl.h>
@ -914,26 +936,61 @@ int cv_vsnprintf(char* buf, int len, const char* fmt, va_list args)
 #endif
 }

+static void dumpException(const Exception& exc)  // Formats one exception into a single message line and writes it to the Android log or to stderr.
+{
+    const char* errorStr = cvErrorStr(exc.code);  // string for exc.code, printed right after "Error:"
+    char buf[1 << 12];  // fixed 4096-byte buffer; cv_snprintf receives sizeof(buf) as its limit
+
+    cv_snprintf(buf, sizeof(buf),
+        "OpenCV(%s) Error: %s (%s) in %s, file %s, line %d",
+        CV_VERSION,
+        errorStr, exc.err.c_str(), exc.func.size() > 0 ?
+        exc.func.c_str() : "unknown function", exc.file.c_str(), exc.line);  // an empty exc.func is reported as "unknown function"
+#ifdef __ANDROID__
+    __android_log_print(ANDROID_LOG_ERROR, "cv::error()", "%s", buf);  // Android build: only the Android log is written, nothing goes to stderr here
+#else
+    fflush(stdout); fflush(stderr);  // flush both streams before printing (presumably to keep the message after earlier buffered output)
+    fprintf(stderr, "%s\n", buf);
+    fflush(stderr);
+#endif
+}
+
+#ifdef CV_ERROR_SET_TERMINATE_HANDLER  // state and handler used by cv::error() to report the last OpenCV error on std::terminate
+static bool cv_terminate_handler_installed = false;  // set to true on the first cv::error() call, even when installation is disabled by the parameter below
+static std::terminate_handler cv_old_terminate_handler;  // receives the return value of std::set_terminate(); never invoked at present (branch below is disabled)
+static cv::Exception cv_terminate_handler_exception;  // copy of the most recent exception passed to cv::error()
+static bool param_setupTerminateHandler = utils::getConfigurationParameterBool("OPENCV_SETUP_TERMINATE_HANDLER", true);  // when false, cv::error() does not call std::set_terminate()
+static void cv_terminate_handler() {  // prints the last recorded OpenCV error, then aborts the process
+    std::cerr << "OpenCV: terminate handler is called! The last OpenCV error is:\n";
+    dumpException(cv_terminate_handler_exception);
+    if (false /*cv_old_terminate_handler*/)  // buggy behavior is observed with doubled "abort/retry/ignore" windows
+        cv_old_terminate_handler();
+    abort();
+}
+
+#endif
+
 void error( const Exception& exc )
 {
+#ifdef CV_ERROR_SET_TERMINATE_HANDLER
+    {
+        cv::AutoLock lock(getInitializationMutex());
+        if (!cv_terminate_handler_installed)
+        {
+            if (param_setupTerminateHandler)
+                cv_old_terminate_handler = std::set_terminate(cv_terminate_handler);
+            cv_terminate_handler_installed = true;
+        }
+        cv_terminate_handler_exception = exc;
+    }
+#endif
+
    if (customErrorCallback != 0)
        customErrorCallback(exc.code, exc.func.c_str(), exc.err.c_str(),
                            exc.file.c_str(), exc.line, customErrorCallbackData);
-    else
+    else if (param_dumpErrors)
    {
-        const char* errorStr = cvErrorStr(exc.code);
-        char buf[1 << 12];
-
-        cv_snprintf(buf, sizeof(buf),
-            "OpenCV(%s) Error: %s (%s) in %s, file %s, line %d",
-            CV_VERSION,
-            errorStr, exc.err.c_str(), exc.func.size() > 0 ?
-            exc.func.c_str() : "unknown function", exc.file.c_str(), exc.line);
-        fprintf( stderr, "%s\n", buf );
-        fflush( stderr );
-#  ifdef __ANDROID__
-        __android_log_print(ANDROID_LOG_ERROR, "cv::error()", "%s", buf);
-#  endif
+        dumpException(exc);
    }

    if(breakOnError)
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@ -381,6 +381,7 @@ UMat Mat::getUMat(int accessFlags, UMatUsageFlags usageFlags) const
        if(!a)
            a = a0;
        new_u = a->allocate(dims, size.p, type(), data, step.p, accessFlags, usageFlags);
+        new_u->originalUMatData = u;
    }
    bool allocated = false;
    CV_TRY
@ -404,7 +405,6 @@ UMat Mat::getUMat(int accessFlags, UMatUsageFlags usageFlags) const
            CV_Assert(new_u->tempUMat());
        }
 #endif
-        new_u->originalUMatData = u;
        CV_XADD(&(u->refcount), 1);
        CV_XADD(&(u->urefcount), 1);
    }
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@ -215,6 +215,8 @@ TEST(hal_intrin, float32x4) {
        .test_matmul()
        .test_transpose()
        .test_reduce_sum4()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
        ;
 }

@ -233,6 +235,8 @@ TEST(hal_intrin, float64x2) {
        .test_unpack()
        .test_float_math()
        .test_float_cvt32()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
        ;
 }
 #endif
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@ -120,7 +120,7 @@ static inline Mat getPlane(const Mat &m, int n, int cn)
    return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr<float>(n, cn));
 }

-static inline MatShape shape(const int* dims, const int n = 4)
+static inline MatShape shape(const int* dims, const int n)
 {
    MatShape shape;
    shape.assign(dims, dims + n);
@ -132,6 +132,11 @@ static inline MatShape shape(const Mat& mat)
    return shape(mat.size.p, mat.dims);
 }

+static inline MatShape shape(const MatSize& sz)  // Builds a MatShape from a MatSize by forwarding to shape(const int*, int).
+{
+    return shape(sz.p, sz[-1]);  // sz[-1] is used as the element count; presumably MatSize keeps the number of dims at index -1 -- TODO confirm
+}
+
 static inline MatShape shape(const UMat& mat)
 {
    return shape(mat.size.p, mat.dims);
@ -142,7 +147,7 @@ namespace {inline bool is_neg(int i) { return i < 0; }}
 static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
 {
    int dims[] = {a0, a1, a2, a3};
-    MatShape s = shape(dims);
+    MatShape s = shape(dims, 4);  // count passed explicitly: the (const int*, int) overload no longer defaults n to 4
    s.erase(std::remove_if(s.begin(), s.end(), is_neg), s.end());
    return s;
 }
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@ -95,24 +95,18 @@ PERF_TEST_P_(DNNTestNetwork, AlexNet)

 PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
            "", Mat(cv::Size(224, 224), CV_32FC3));
 }

 PERF_TEST_P_(DNNTestNetwork, ResNet_50)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
            "resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3));
 }

 PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
            "squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3));
 }
@ -217,6 +211,16 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
            Mat(cv::Size(300, 300), CV_32FC3));
 }

+PERF_TEST_P_(DNNTestNetwork, YOLOv3)  // Performance test: runs the YOLOv3 Darknet model on one sample image.
+{
+    if (backend != DNN_BACKEND_DEFAULT)  // every backend other than the default one is skipped
+        throw SkipTestException("");
+    Mat sample = imread(findDataFile("dnn/dog416.png", false));
+    Mat inp;
+    sample.convertTo(inp, CV_32FC3);  // convert to float before scaling
+    processNet("dnn/yolov3.cfg", "dnn/yolov3.weights", "", inp / 255);  // input divided by 255 (maps to [0, 1] assuming an 8-bit source image -- TODO confirm)
+}
+
 const tuple<DNNBackend, DNNTarget> testCases[] = {
 #ifdef HAVE_HALIDE
    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@ -89,6 +89,8 @@ namespace cv {
                return init_val;
            }

+            static const std::string kFirstLayerName = "data";  // initial value of setLayersParams::last_layer and the second bottom of every yolo layer
+
            class setLayersParams {

                NetParameter *net;
@ -97,8 +99,8 @@ namespace cv {
                std::vector<std::string> fused_layer_names;

            public:
-                setLayersParams(NetParameter *_net, std::string _first_layer = "data") :
-                    net(_net), layer_id(0), last_layer(_first_layer)
+                setLayersParams(NetParameter *_net) :  // the first-layer name is no longer a parameter
+                    net(_net), layer_id(0), last_layer(kFirstLayerName)  // last_layer starts at the shared first-layer name constant
                {}

                void setLayerBlobs(int i, std::vector<cv::Mat> blobs)
@ -275,7 +277,7 @@ namespace cv {
                    fused_layer_names.push_back(last_layer);
                }

-                void setPermute()
+                void setPermute(bool isDarknetLayer = true)
                {
                    cv::dnn::LayerParams permute_params;
                    permute_params.name = "Permute-name";
@ -294,8 +296,11 @@ namespace cv {
                    last_layer = layer_name;
                    net->layers.push_back(lp);

-                    layer_id++;
-                    fused_layer_names.push_back(last_layer);
+                    if (isDarknetLayer)
+                    {
+                        layer_id++;
+                        fused_layer_names.push_back(last_layer);
+                    }
                }

                void setRegion(float thresh, int coords, int classes, int anchors, int classfix, int softmax, int softmax_tree, float *biasData)
@ -327,6 +332,85 @@ namespace cv {
                    layer_id++;
                    fused_layer_names.push_back(last_layer);
                }
+
+                void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors)  // Appends a "Region" layer in logistic mode whose bias blob holds the anchor values selected by `mask`.
+                {
+                    cv::dnn::LayerParams region_param;
+                    region_param.name = "Region-name";
+                    region_param.type = "Region";
+
+                    CV_Assert(!mask.empty());  // an empty mask selects no anchors and would make &usedAnchors[0] below invalid
+                    const int numAnchors = static_cast<int>(mask.size());
+                    region_param.set<int>("classes", classes);
+                    region_param.set<int>("anchors", numAnchors);
+                    region_param.set<bool>("logistic", true);
+
+                    std::vector<float> usedAnchors(numAnchors * 2);  // two values per mask entry
+                    for (int i = 0; i < numAnchors; ++i)
+                    {
+                        CV_Assert(mask[i] >= 0 && static_cast<size_t>(mask[i]) * 2 + 1 < anchors.size());  // mask values come from the parsed .cfg: reject indices outside `anchors` instead of reading out of range
+                        usedAnchors[i * 2] = anchors[mask[i] * 2];
+                        usedAnchors[i * 2 + 1] = anchors[mask[i] * 2 + 1];
+                    }
+                    cv::Mat biasData_mat = cv::Mat(1, numAnchors * 2, CV_32F, &usedAnchors[0]).clone();  // clone() copies the values out of the local vector before it is destroyed
+                    region_param.blobs.push_back(biasData_mat);
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("yolo_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = region_param.type;
+                    lp.layerParams = region_param;
+                    lp.bottom_indexes.push_back(last_layer);  // first input: the previously added layer
+                    lp.bottom_indexes.push_back(kFirstLayerName);  // second input: the network's first layer
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+
+                    layer_id++;  // counted as one darknet layer
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                void setShortcut(int from)  // Appends an "Eltwise" layer with op "sum" whose inputs are darknet layer `from` and the current last layer.
+                {
+                    cv::dnn::LayerParams shortcut_param;
+                    shortcut_param.name = "Shortcut-name";
+                    shortcut_param.type = "Eltwise";
+
+                    shortcut_param.set<std::string>("op", "sum");
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("shortcut_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = shortcut_param.type;
+                    lp.layerParams = shortcut_param;
+                    lp.bottom_indexes.push_back(fused_layer_names.at(from));  // `from` is an index into fused_layer_names; at() throws on an out-of-range value
+                    lp.bottom_indexes.push_back(last_layer);  // second input: the previously added layer
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+
+                    layer_id++;  // counted as one darknet layer
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                void setUpsample(int scaleFactor)  // Appends a "ResizeNearestNeighbor" layer whose "zoom_factor" is scaleFactor.
+                {
+                    cv::dnn::LayerParams param;
+                    param.name = "Upsample-name";
+                    param.type = "ResizeNearestNeighbor";
+
+                    param.set<int>("zoom_factor", scaleFactor);
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("upsample_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = param.type;
+                    lp.layerParams = param;
+                    lp.bottom_indexes.push_back(last_layer);  // single input: the previously added layer
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+
+                    layer_id++;  // counted as one darknet layer
+                    fused_layer_names.push_back(last_layer);  // recorded under this layer's darknet index
+                }
            };

            std::string escapeString(const std::string &src)
@ -464,7 +548,7 @@ namespace cv {

                        current_channels = 0;
                        for (size_t k = 0; k < layers_vec.size(); ++k) {
-                            layers_vec[k] += layers_counter;
+                            layers_vec[k] = layers_vec[k] > 0 ? layers_vec[k] : (layers_vec[k] + layers_counter);
                            current_channels += net->out_channels_vec[layers_vec[k]];
                        }

@ -496,9 +580,43 @@ namespace cv {

                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());

-                        setParams.setPermute();
+                        setParams.setPermute(false);
                        setParams.setRegion(thresh, coords, classes, num_of_anchors, classfix, softmax, softmax_tree, anchors_vec.data());
                    }
+                    else if (layer_type == "shortcut")
+                    {
+                        std::string bottom_layer = getParam<std::string>(layer_params, "from", "");
+                        CV_Assert(!bottom_layer.empty());
+                        int from = std::atoi(bottom_layer.c_str());
+
+                        from += layers_counter;
+                        current_channels = net->out_channels_vec[from];
+
+                        setParams.setShortcut(from);
+                    }
+                    else if (layer_type == "upsample")
+                    {
+                        int scaleFactor = getParam<int>(layer_params, "stride", 1);
+                        setParams.setUpsample(scaleFactor);
+                    }
+                    else if (layer_type == "yolo")
+                    {
+                        int classes = getParam<int>(layer_params, "classes", -1);
+                        int num_of_anchors = getParam<int>(layer_params, "num", -1);
+
+                        std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
+                        CV_Assert(!anchors_values.empty());
+                        std::vector<float> anchors_vec = getNumbers<float>(anchors_values);
+
+                        std::string mask_values = getParam<std::string>(layer_params, "mask", std::string());
+                        CV_Assert(!mask_values.empty());
+                        std::vector<int> mask_vec = getNumbers<int>(mask_values);
+
+                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());
+
+                        setParams.setPermute(false);
+                        setParams.setYolo(classes, mask_vec, anchors_vec);
+                    }
                    else {
                        CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
                    }
@ -598,6 +716,10 @@ namespace cv {
                        if(activation == "leaky")
                            ++cv_layers_counter;
                    }
+                    if (layer_type == "region" || layer_type == "yolo")
+                    {
+                        ++cv_layers_counter;  // For permute.
+                    }
                    current_channels = net->out_channels_vec[darknet_layers_counter];
                }
                return true;
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1255,6 +1255,15 @@ struct Net::Impl
                    if (weightableLayer->_biases)
                        weightableLayer->_biases = convertFp16(weightableLayer->_biases);
                }
+                else
+                {
+                    for (const auto& weights : {"weights", "biases"})
+                    {
+                        auto it = ieNode->layer->blobs.find(weights);
+                        if (it != ieNode->layer->blobs.end())
+                            it->second = convertFp16(it->second);
+                    }
+                }
            }

            ieNode->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers);
@ -1527,12 +1536,11 @@ struct Net::Impl
                                convLayer = downLayerData->layerInstance.dynamicCast<ConvolutionLayer>();

                            //  first input layer is convolution layer
-                            if( !convLayer.empty() )
+                            if( !convLayer.empty() && eltwiseData->consumers.size() == 1 )
                            {
                                // fuse eltwise + activation layer
                                LayerData *firstConvLayerData = downLayerData;
                                {
-                                    CV_Assert(eltwiseData->consumers.size() == 1);
                                    nextData = &layers[eltwiseData->consumers[0].lid];
                                    lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
                                    Ptr<ActivationLayer> nextActivLayer;
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -234,7 +234,7 @@ public:
        CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);

        int dims[] = {inputs[0][0], outCn, out.height, out.width};
-        outputs.resize(inputs.size(), shape(dims));
+        outputs.resize(inputs.size(), shape(dims, 4));

        return false;
    }
@ -1088,7 +1088,7 @@ public:
        CV_Assert(blobs[0].size[0] == inpCn);

        int dims[] = {inputs[0][0], outCn, outH, outW};
-        outputs.resize(inputs.size(), shape(dims));
+        outputs.resize(inputs.size(), shape(dims, 4));

        internals.push_back(MatShape());
        if (!is1x1())
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -853,7 +853,7 @@ public:
            dims[0] = inputs[1][0];  // Number of proposals;
            dims[1] = psRoiOutChannels;
        }
-        outputs.assign(type == MAX ? 2 : 1, shape(dims));
+        outputs.assign(type == MAX ? 2 : 1, shape(dims, 4));
        return false;
    }

--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -295,6 +295,19 @@ public:
        return false;
    }

+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE  // Replaces zero-valued _stepX/_stepY with the ratio of inputs[1] size to inputs[0] size.
+    {
+        CV_Assert(inputs.size() > 1, inputs[0]->dims == 4, inputs[1]->dims == 4);  // needs two 4-dimensional inputs
+        int layerWidth = inputs[0]->size[3];  // inputs[0]: width from axis 3, height from axis 2
+        int layerHeight = inputs[0]->size[2];
+
+        int imageWidth = inputs[1]->size[3];  // inputs[1]: same axis convention
+        int imageHeight = inputs[1]->size[2];
+
+        _stepY = _stepY == 0 ? (static_cast<float>(imageHeight) / layerHeight) : _stepY;  // each step is resolved independently; a non-zero value is kept as is
+        _stepX = _stepX == 0 ? (static_cast<float>(imageWidth) / layerWidth) : _stepX;
+    }
+
 #ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
@ -310,16 +323,6 @@ public:
        int _imageWidth = inputs[1].size[3];
        int _imageHeight = inputs[1].size[2];

-        float stepX, stepY;
-        if (_stepX == 0 || _stepY == 0)
-        {
-            stepX = static_cast<float>(_imageWidth) / _layerWidth;
-            stepY = static_cast<float>(_imageHeight) / _layerHeight;
-        } else {
-            stepX = _stepX;
-            stepY = _stepY;
-        }
-
        if (umat_offsetsX.empty())
        {
            Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
@ -339,8 +342,8 @@ public:

        ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
        kernel.set(0, (int)nthreads);
-        kernel.set(1, (float)stepX);
-        kernel.set(2, (float)stepY);
+        kernel.set(1, (float)_stepX);
+        kernel.set(2, (float)_stepY);
        kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
        kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
        kernel.set(5, (int)_offsetsX.size());
@ -410,15 +413,6 @@ public:
        int _imageWidth = inputs[1]->size[3];
        int _imageHeight = inputs[1]->size[2];

-        float stepX, stepY;
-        if (_stepX == 0 || _stepY == 0) {
-          stepX = static_cast<float>(_imageWidth) / _layerWidth;
-          stepY = static_cast<float>(_imageHeight) / _layerHeight;
-        } else {
-          stepX = _stepX;
-          stepY = _stepY;
-        }
-
        float* outputPtr = outputs[0].ptr<float>();
        float _boxWidth, _boxHeight;
        for (size_t h = 0; h < _layerHeight; ++h)
@ -431,8 +425,8 @@ public:
                    _boxHeight = _boxHeights[i];
                    for (int j = 0; j < _offsetsX.size(); ++j)
                    {
-                        float center_x = (w + _offsetsX[j]) * stepX;
-                        float center_y = (h + _offsetsY[j]) * stepY;
+                        float center_x = (w + _offsetsX[j]) * _stepX;
+                        float center_y = (h + _offsetsY[j]) * _stepY;
                        outputPtr = addPrior(center_x, center_y, _boxWidth, _boxHeight, _imageWidth,
                                             _imageHeight, _bboxesNormalized, outputPtr);
                    }
@ -495,7 +489,7 @@ public:
                ieLayer->params["aspect_ratio"] += format(",%f", _aspectRatios[i]);
        }

-        ieLayer->params["flip"] = _flip ? "1" : "0";
+        ieLayer->params["flip"] = "0";  // We already flipped aspect ratios.
        ieLayer->params["clip"] = _clip ? "1" : "0";

        CV_Assert(!_variance.empty());
@ -503,12 +497,20 @@ public:
        for (int i = 1; i < _variance.size(); ++i)
            ieLayer->params["variance"] += format(",%f", _variance[i]);

-        ieLayer->params["step"] = _stepX == _stepY ? format("%f", _stepX) : "0";
-        ieLayer->params["step_h"] = _stepY;
-        ieLayer->params["step_w"] = _stepX;
-
+        if (_stepX == _stepY)
+        {
+            ieLayer->params["step"] = format("%f", _stepX);
+            ieLayer->params["step_h"] = "0.0";
+            ieLayer->params["step_w"] = "0.0";
+        }
+        else
+        {
+            ieLayer->params["step"] = "0.0";
+            ieLayer->params["step_h"] = format("%f", _stepY);
+            ieLayer->params["step_w"] = format("%f", _stepX);
+        }
        CV_Assert(_offsetsX.size() == 1, _offsetsY.size() == 1, _offsetsX[0] == _offsetsY[0]);
-        ieLayer->params["offset"] = format("%f", _offsetsX[0]);;
+        ieLayer->params["offset"] = format("%f", _offsetsX[0]);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #endif  // HAVE_INF_ENGINE
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@ -59,7 +59,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
 public:
    int coords, classes, anchors, classfix;
    float thresh, nmsThreshold;
-    bool useSoftmaxTree, useSoftmax;
+    bool useSoftmax, useLogistic;

    RegionLayerImpl(const LayerParams& params)
    {
@ -71,15 +71,17 @@ public:
        classes = params.get<int>("classes", 0);
        anchors = params.get<int>("anchors", 5);
        classfix = params.get<int>("classfix", 0);
-        useSoftmaxTree = params.get<bool>("softmax_tree", false);
        useSoftmax = params.get<bool>("softmax", false);
+        useLogistic = params.get<bool>("logistic", false);
        nmsThreshold = params.get<float>("nms_threshold", 0.4);

        CV_Assert(nmsThreshold >= 0.);
        CV_Assert(coords == 4);
        CV_Assert(classes >= 1);
        CV_Assert(anchors >= 1);
-        CV_Assert(useSoftmaxTree || useSoftmax);
+        CV_Assert(useLogistic || useSoftmax);
+        if (params.get<bool>("softmax_tree", false))
+            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -89,7 +91,7 @@ public:
    {
        CV_Assert(inputs.size() > 0);
        CV_Assert(inputs[0][3] == (1 + coords + classes)*anchors);
-        outputs = std::vector<MatShape>(inputs.size(), shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
+        outputs = std::vector<MatShape>(1, shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
        return false;
    }

@ -124,14 +126,13 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

+        // TODO: implement a logistic activation to classification scores.
+        if (useLogistic)
+            return false;
+
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

-        if (useSoftmaxTree) {   // Yolo 9000
-            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-            return false;
-        }
-
        CV_Assert(inputs.size() >= 1);
        int const cell_size = classes + coords + 1;
        UMat blob_umat = blobs[0].getUMat(ACCESS_READ);
@ -203,6 +204,7 @@ public:
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_Assert(inputs.size() >= 1);
+        CV_Assert(outputs.size() == 1);
        int const cell_size = classes + coords + 1;

        const float* biasData = blobs[0].ptr<float>();
@ -214,6 +216,9 @@ public:

            int rows = inpBlob.size[1];
            int cols = inpBlob.size[2];
+            CV_Assert(inputs.size() < 2 || inputs[1]->dims == 4);
+            int hNorm = inputs.size() > 1 ? inputs[1]->size[2] : rows;
+            int wNorm = inputs.size() > 1 ? inputs[1]->size[3] : cols;

            const float *srcData = inpBlob.ptr<float>();
            float *dstData = outBlob.ptr<float>();
@ -225,49 +230,47 @@ public:
                dstData[index + 4] = logistic_activate(x);	// logistic activation
            }

-            if (useSoftmaxTree) {   // Yolo 9000
-                CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-            }
-            else if (useSoftmax) {  // Yolo v2
+            if (useSoftmax) {  // Yolo v2
                // softmax activation for Probability, for each grid cell (X x Y x Anchor-index)
                for (int i = 0; i < rows*cols*anchors; ++i) {
                    int index = cell_size*i;
                    softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
                }
-
-                for (int x = 0; x < cols; ++x)
-                    for(int y = 0; y < rows; ++y)
-                        for (int a = 0; a < anchors; ++a) {
-                            int index = (y*cols + x)*anchors + a;	// index for each grid-cell & anchor
-                            int p_index = index * cell_size + 4;
-                            float scale = dstData[p_index];
-                            if (classfix == -1 && scale < .5) scale = 0;	// if(t0 < 0.5) t0 = 0;
-                            int box_index = index * cell_size;
-
-                            dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols;
-                            dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows;
-                            dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / cols;
-                            dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / rows;
-
-                            int class_index = index * cell_size + 5;
-
-                            if (useSoftmaxTree) {
-                                CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-                            }
-                            else {
-                                for (int j = 0; j < classes; ++j) {
-                                    float prob = scale*dstData[class_index + j];	// prob = IoU(box, object) = t0 * class-probability
-                                    dstData[class_index + j] = (prob > thresh) ? prob : 0;		// if (IoU < threshold) IoU = 0;
-                                }
-                            }
-                        }
-
            }
-
+            else if (useLogistic) {  // Yolo v3
+                // Logistic activation applied independently to every class score of
+                // every grid cell and anchor; class scores start at offset 5 in a cell.
+                for (int i = 0; i < rows*cols*anchors; ++i)
+                {
+                    int index = cell_size*i;
+                    const float* input = srcData + index + 5;
+                    float* output = dstData + index + 5;
+                    // Separate index name so the class loop does not shadow the cell index.
+                    for (int j = 0; j < classes; ++j)
+                        output[j] = logistic_activate(input[j]);
+                }
+            }
+            // Decode the box and scale the class scores of every grid cell and anchor.
+            for (int x = 0; x < cols; ++x)
+                for(int y = 0; y < rows; ++y)
+                    for (int a = 0; a < anchors; ++a) {
+                        int index = (y*cols + x)*anchors + a;  // index for each grid-cell & anchor
+                        int p_index = index * cell_size + 4;
+                        float scale = dstData[p_index];
+                        if (classfix == -1 && scale < .5) scale = 0;  // if(t0 < 0.5) t0 = 0;
+                        int box_index = index * cell_size;
+
+                        // Box center, relative to the grid: x by columns, y by rows.
+                        dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols;
+                        dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows;
+                        // Box size: offset 2 is the horizontal extent (it pairs with x / cols),
+                        // so it is normalized by wNorm; offset 3 is the vertical extent and is
+                        // normalized by hNorm. Mixing them up only shows on non-square sizes.
+                        dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
+                        dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;
+
+                        int class_index = index * cell_size + 5;
+
+                        for (int j = 0; j < classes; ++j) {
+                            float prob = scale*dstData[class_index + j];  // prob = IoU(box, object) = t0 * class-probability
+                            dstData[class_index + j] = (prob > thresh) ? prob : 0;  // if (IoU < threshold) IoU = 0;
+                        }
+                    }
            if (nmsThreshold > 0) {
                do_nms_sort(dstData, rows*cols*anchors, thresh, nmsThreshold);
            }
-
        }
    }

--- a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
+++ b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
@ -16,9 +16,11 @@ public:
    ResizeNearestNeighborLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
-        CV_Assert(params.has("width"), params.has("height"));
-        outWidth = params.get<float>("width");
-        outHeight = params.get<float>("height");
+        CV_Assert(params.has("width") && params.has("height") || params.has("zoom_factor"));
+        CV_Assert(!params.has("width") && !params.has("height") || !params.has("zoom_factor"));
+        outWidth = params.get<float>("width", 0);
+        outHeight = params.get<float>("height", 0);
+        zoomFactor = params.get<int>("zoom_factor", 1);
        alignCorners = params.get<bool>("align_corners", false);
        if (alignCorners)
            CV_Error(Error::StsNotImplemented, "Nearest neighborhood resize with align_corners=true is not implemented");
@ -31,12 +33,21 @@ public:
    {
        CV_Assert(inputs.size() == 1, inputs[0].size() == 4);
        outputs.resize(1, inputs[0]);
-        outputs[0][2] = outHeight;
-        outputs[0][3] = outWidth;
+        outputs[0][2] = outHeight > 0 ? outHeight : (outputs[0][2] * zoomFactor);
+        outputs[0][3] = outWidth > 0 ? outWidth : (outputs[0][3] * zoomFactor);
        // We can work in-place (do nothing) if input shape == output shape.
        return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
    }

+    virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
+    {
+        // Explicit target size already known: nothing to derive.
+        if (outWidth != 0 || outHeight != 0)
+            return;
+
+        // Neither dimension was set, so take the spatial size from the first output
+        // blob (axes 2 and 3).
+        const Mat& dst = outputs[0];
+        outHeight = dst.size[2];
+        outWidth = dst.size[3];
+    }
+
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
@ -65,7 +76,7 @@ public:
        }
    }
 private:
-    int outWidth, outHeight;
+    int outWidth, outHeight, zoomFactor;
    bool alignCorners;
 };

--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@ -139,7 +139,6 @@ InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net)
    inputs = net.getInputsInfo();
    outputs = net.getOutputsInfo();
    layers.resize(net.layerCount());  // A hack to execute InfEngineBackendNet::layerCount correctly.
-    initPlugin(net);
 }

 void InfEngineBackendNet::Release() noexcept
@ -234,8 +233,16 @@ InferenceEngine::StatusCode
 InfEngineBackendNet::getLayerByName(const char *layerName, InferenceEngine::CNNLayerPtr &out,
                                    InferenceEngine::ResponseDesc *resp) noexcept
 {
-    CV_Error(Error::StsNotImplemented, "");
-    return InferenceEngine::StatusCode::OK;
+    for (auto& l : layers)
+    {
+        if (l->name == layerName)
+        {
+            out = l;
+            return InferenceEngine::StatusCode::OK;
+        }
+    }
+    CV_Error(Error::StsObjectNotFound, cv::format("Cannot find a layer %s", layerName));
+    return InferenceEngine::StatusCode::NOT_FOUND;
 }

 void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device) noexcept
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@ -419,6 +419,125 @@ public:
    }
 };

+// Subgraph pattern for the output-shape computation that feeds a Conv2DBackpropInput
+// node (VALID padding; presumably the form emitted by Keras, as the class name says):
+//   Pack(slice0, slice1 * Const + Const, slice2 * Const + Const, Const)
+// where slice0..2 are StridedSlice ops over Shape(input). The whole pattern is fused
+// into a single Conv2DBackpropInput node with inputs (Const, kernel, input).
+class DeconvolutionValidKerasSubgraph : public Subgraph
+{
+public:
+    DeconvolutionValidKerasSubgraph()
+    {
+        int input = addNodeToMatch("");
+        int shape = addNodeToMatch("Shape", input);
+        int kernel = addNodeToMatch("Const");
+
+        // Three StridedSlice ops over the same Shape node, each with its own three
+        // Const inputs. The stack* variables are reused for every slice.
+        int stack = addNodeToMatch("Const");
+        int stack_1 = addNodeToMatch("Const");
+        int stack_2 = addNodeToMatch("Const");
+        int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        stack = addNodeToMatch("Const");
+        stack_1 = addNodeToMatch("Const");
+        stack_2 = addNodeToMatch("Const");
+        int strided_slice_1 = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        stack = addNodeToMatch("Const");
+        stack_1 = addNodeToMatch("Const");
+        stack_2 = addNodeToMatch("Const");
+        int strided_slice_2 = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        // Second and third slices go through Mul(Const) followed by Add(Const).
+        int mul = addNodeToMatch("Mul", strided_slice_1, addNodeToMatch("Const"));
+        int add = addNodeToMatch("Add", mul, addNodeToMatch("Const"));
+
+        int mul_1 = addNodeToMatch("Mul", strided_slice_2, addNodeToMatch("Const"));
+        int add_1 = addNodeToMatch("Add", mul_1, addNodeToMatch("Const"));
+        int pack = addNodeToMatch("Pack", strided_slice, add, add_1, addNodeToMatch("Const"));
+        addNodeToMatch("Conv2DBackpropInput", pack, kernel, input);
+        // Put any unused Const op to the first input.
+        // (`stack` is the last one assigned above: the first Const of strided_slice_2.)
+        setFusedNode("Conv2DBackpropInput", stack, kernel, input);
+    }
+
+    // Rewrites the Const tensor placed at the first input of the fused node so that it
+    // holds [-1, kernelHeight, kernelWidth, -1]. Asserts that the fused node uses
+    // VALID padding and that the kernel tensor shape has 4 dimensions.
+    virtual void finalize(tensorflow::GraphDef&, tensorflow::NodeDef* fusedNode,
+                          std::vector<tensorflow::NodeDef*>& inputNodes) CV_OVERRIDE
+    {
+        // Disable adjusted paddings (see Conv2DBackpropInput layer at tf_importer.cpp)
+        // adj_w = (outW - ((pad == "SAME") ? 1 : kernelW)) % strideX;
+        // adj_h = (outH - ((pad == "SAME") ? 1 : kernelH)) % strideY;
+        // Where outH and outW are 1st and 2nd dimensions (NHWC) or 2nd and third (NCHW).
+        std::string padMode = fusedNode->attr().at("padding").s();
+        CV_Assert(padMode == "VALID");
+
+        // inputNodes[1] is the kernel Const; dims 0 and 1 are read as height and width.
+        const tensorflow::TensorShapeProto& kernelShape =
+            inputNodes[1]->mutable_attr()->at("value").tensor().tensor_shape();
+
+        CV_Assert(kernelShape.dim_size() == 4);
+        const int kernelHeight = kernelShape.dim(0).size();
+        const int kernelWidth = kernelShape.dim(1).size();
+
+        // With outH == kernelH and outW == kernelW the VALID formula above evaluates
+        // to (kernel - kernel) % stride, i.e. zero adjustment.
+        tensorflow::TensorProto* outShape = inputNodes[0]->mutable_attr()->at("value").mutable_tensor();
+        outShape->clear_int_val();
+        outShape->add_int_val(-1);
+        outShape->add_int_val(kernelHeight);
+        outShape->add_int_val(kernelWidth);
+        outShape->add_int_val(-1);
+    }
+};
+
+// Subgraph pattern for the output-shape computation that feeds a Conv2DBackpropInput
+// node (SAME padding; presumably the form emitted by Keras, as the class name says):
+//   Pack(slice0, slice1 * Const, slice2 * Const, Const)
+// where slice0..2 are StridedSlice ops over Shape(input). The whole pattern is fused
+// into a single Conv2DBackpropInput node with inputs (Const, kernel, input).
+class DeconvolutionSameKerasSubgraph : public Subgraph
+{
+public:
+    DeconvolutionSameKerasSubgraph()
+    {
+        int input = addNodeToMatch("");
+        int shape = addNodeToMatch("Shape", input);
+        int kernel = addNodeToMatch("Const");
+
+        // Three StridedSlice ops over the same Shape node, each with its own three
+        // Const inputs. The stack* variables are reused for every slice.
+        int stack = addNodeToMatch("Const");
+        int stack_1 = addNodeToMatch("Const");
+        int stack_2 = addNodeToMatch("Const");
+        int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        stack = addNodeToMatch("Const");
+        stack_1 = addNodeToMatch("Const");
+        stack_2 = addNodeToMatch("Const");
+        int strided_slice_1 = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        stack = addNodeToMatch("Const");
+        stack_1 = addNodeToMatch("Const");
+        stack_2 = addNodeToMatch("Const");
+        int strided_slice_2 = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+
+        // Second and third slices are only multiplied by a Const (no Add in this variant).
+        int mul = addNodeToMatch("Mul", strided_slice_1, addNodeToMatch("Const"));
+
+        int mul_1 = addNodeToMatch("Mul", strided_slice_2, addNodeToMatch("Const"));
+        int pack = addNodeToMatch("Pack", strided_slice, mul, mul_1, addNodeToMatch("Const"));
+        addNodeToMatch("Conv2DBackpropInput", pack, kernel, input);
+        // Put any unused Const op to the first input.
+        // (`stack` is the last one assigned above: the first Const of strided_slice_2.)
+        setFusedNode("Conv2DBackpropInput", stack, kernel, input);
+    }
+
+    // Rewrites the Const tensor placed at the first input of the fused node so that it
+    // holds [-1, strideY, strideX, -1]. Asserts that the fused node uses SAME padding
+    // and that its "strides" attribute has 4 entries.
+    virtual void finalize(tensorflow::GraphDef&, tensorflow::NodeDef* fusedNode,
+                          std::vector<tensorflow::NodeDef*>& inputNodes) CV_OVERRIDE
+    {
+        // Disable adjusted paddings (see Conv2DBackpropInput layer at tf_importer.cpp)
+        // adj_w = (outW - ((pad == "SAME") ? 1 : kernelW)) % strideX;
+        // adj_h = (outH - ((pad == "SAME") ? 1 : kernelH)) % strideY;
+        // Where outH and outW are 1st and 2nd dimensions (NHWC) or 2nd and third (NCHW).
+        std::string padMode = fusedNode->attr().at("padding").s();
+        CV_Assert(padMode == "SAME");
+
+        const tensorflow::AttrValue_ListValue& strides = fusedNode->attr().at("strides").list();
+        CV_Assert(strides.i_size() == 4);
+
+        // Entries 1 and 2 of the strides list are taken as the vertical and
+        // horizontal stride.
+        const int strideY = strides.i(1);
+        const int strideX = strides.i(2);
+
+        // Replace the stored values with the strides; -1 is written at positions 0 and 3.
+        tensorflow::TensorProto* outShape = inputNodes[0]->mutable_attr()->at("value").mutable_tensor();
+        outShape->clear_int_val();
+        outShape->add_int_val(-1);
+        outShape->add_int_val(strideY);
+        outShape->add_int_val(strideX);
+        outShape->add_int_val(-1);
+    }
+};
+
 void simplifySubgraphs(tensorflow::GraphDef& net)
 {
    std::vector<Ptr<Subgraph> > subgraphs;
@ -430,6 +549,8 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
    subgraphs.push_back(Ptr<Subgraph>(new ReLU6KerasSubgraph()));
    subgraphs.push_back(Ptr<Subgraph>(new ReshapeKerasSubgraph(3)));
    subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph()));
+    subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionValidKerasSubgraph()));
+    subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionSameKerasSubgraph()));

    int numNodes = net.node_size();
    std::vector<int> matchedNodesIds;
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -1303,8 +1303,8 @@ void TFImporter::populateNet(Net dstNet)
            const int strideY = layerParams.get<int>("stride_h");
            const int strideX = layerParams.get<int>("stride_w");
            Mat outShape = getTensorContent(getConstBlob(layer, value_id, 0));
-            const int outH = outShape.at<int>(2);
-            const int outW = outShape.at<int>(1);
+            const int outH = outShape.at<int>(1);
+            const int outW = outShape.at<int>(2);
            if (layerParams.get<String>("pad_mode") == "SAME")
            {
                layerParams.set("adj_w", (outW - 1) % strideX);
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@ -23,9 +23,9 @@ public:
    }

    void processNet(const std::string& weights, const std::string& proto,
-                    Size inpSize, const std::string& outputLayer,
+                    Size inpSize, const std::string& outputLayer = "",
                    const std::string& halideScheduler = "",
-                    double l1 = 1e-5, double lInf = 1e-4)
+                    double l1 = 0.0, double lInf = 0.0)
    {
        // Create a common input blob.
        int blobSize[] = {1, 3, inpSize.height, inpSize.width};
@ -36,9 +36,9 @@ public:
    }

    void processNet(std::string weights, std::string proto,
-                    Mat inp, const std::string& outputLayer,
+                    Mat inp, const std::string& outputLayer = "",
                    std::string halideScheduler = "",
-                    double l1 = 1e-5, double lInf = 1e-4)
+                    double l1 = 0.0, double lInf = 0.0)
    {
        if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
        {
@ -49,6 +49,16 @@ public:
                throw SkipTestException("OpenCL is not available/disabled in OpenCV");
            }
        }
+        if (target == DNN_TARGET_OPENCL_FP16)
+        {
+            l1 = l1 == 0.0 ? 4e-3 : l1;
+            lInf = lInf == 0.0 ? 2e-2 : lInf;
+        }
+        else
+        {
+            l1 = l1 == 0.0 ? 1e-5 : l1;
+            lInf = lInf == 0.0 ? 1e-4 : lInf;
+        }
        weights = findDataFile(weights, false);
        if (!proto.empty())
            proto = findDataFile(proto, false);
@ -71,31 +81,28 @@ public:
        Mat out = net.forward(outputLayer).clone();

        if (outputLayer == "detection_out")
-            checkDetections(outDefault, out, "First run", l1, lInf);
+            normAssertDetections(outDefault, out, "First run", 0.2, l1, lInf);
        else
            normAssert(outDefault, out, "First run", l1, lInf);

        // Test 2: change input.
-        inp *= 0.1f;
+        float* inpData = (float*)inp.data;
+        for (int i = 0; i < inp.size[0] * inp.size[1]; ++i)
+        {
+            Mat slice(inp.size[2], inp.size[3], CV_32F, inpData);
+            cv::flip(slice, slice, 1);
+            inpData += slice.total();
+        }
        netDefault.setInput(inp);
        net.setInput(inp);
        outDefault = netDefault.forward(outputLayer).clone();
        out = net.forward(outputLayer).clone();

        if (outputLayer == "detection_out")
-            checkDetections(outDefault, out, "Second run", l1, lInf);
+            normAssertDetections(outDefault, out, "Second run", 0.2, l1, lInf);
        else
            normAssert(outDefault, out, "Second run", l1, lInf);
    }
-
-    void checkDetections(const Mat& out, const Mat& ref, const std::string& msg,
-                         float l1, float lInf, int top = 5)
-    {
-        top = std::min(std::min(top, out.size[2]), out.size[3]);
-        std::vector<cv::Range> range(4, cv::Range::all());
-        range[2] = cv::Range(0, top);
-        normAssert(out(range), ref(range));
-    }
 };

 TEST_P(DNNTestNetwork, AlexNet)
@ -110,8 +117,6 @@ TEST_P(DNNTestNetwork, AlexNet)

 TEST_P(DNNTestNetwork, ResNet_50)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
               Size(224, 224), "prob",
               target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_resnet_50.yml" :
@ -120,8 +125,6 @@ TEST_P(DNNTestNetwork, ResNet_50)

 TEST_P(DNNTestNetwork, SqueezeNet_v1_1)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
               Size(227, 227), "prob",
               target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml" :
@ -130,8 +133,6 @@ TEST_P(DNNTestNetwork, SqueezeNet_v1_1)

 TEST_P(DNNTestNetwork, GoogLeNet)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
-        throw SkipTestException("");
    processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
               Size(224, 224), "prob");
 }
@ -180,7 +181,7 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
 {
    if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL ||
        backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU ||
-        backend == DNN_BACKEND_INFERENCE_ENGINE)
+        backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
        throw SkipTestException("");
    processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
               "dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out");
@ -189,30 +190,24 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
 TEST_P(DNNTestNetwork, OpenPose_pose_coco)
 {
    if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
-    double l1 = target == DNN_TARGET_OPENCL_FP16 ? 3e-5 : 1e-5;
-    double lInf = target == DNN_TARGET_OPENCL_FP16 ? 3e-3 : 1e-4;
    processNet("dnn/openpose_pose_coco.caffemodel", "dnn/openpose_pose_coco.prototxt",
-               Size(368, 368), "", "", l1, lInf);
+               Size(368, 368));
 }

 TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
 {
    if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
-    double l1 = target == DNN_TARGET_OPENCL_FP16 ? 4e-5 : 1e-5;
-    double lInf = target == DNN_TARGET_OPENCL_FP16 ? 7e-3 : 1e-4;
    processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
-               Size(368, 368), "", "", l1, lInf);
+               Size(368, 368));
 }

 TEST_P(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
 {
    if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
-    double l1 = target == DNN_TARGET_OPENCL_FP16 ? 5e-5 : 1e-5;
-    double lInf = target == DNN_TARGET_OPENCL_FP16 ? 5e-3 : 1e-4;
    // The same .caffemodel but modified .prototxt
    // See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
    processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt",
-               Size(368, 368), "", "", l1, lInf);
+               Size(368, 368));
 }

 TEST_P(DNNTestNetwork, OpenFace)
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@ -167,7 +167,7 @@ TEST(Reproducibility_SSD, Accuracy)
    Mat out = net.forward("detection_out");

    Mat ref = blobFromNPY(_tf("ssd_out.npy"));
-    normAssert(ref, out);
+    normAssertDetections(ref, out);
 }

 typedef testing::TestWithParam<DNNTarget> Reproducibility_MobileNet_SSD;
@ -186,7 +186,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
    Mat out = net.forward();

    Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
-    normAssert(ref, out);
+    normAssertDetections(ref, out);

    // Check that detections aren't preserved.
    inp.setTo(0.0f);
@ -403,14 +403,13 @@ TEST_P(opencv_face_detector, Accuracy)
    // Output has shape 1x1xNx7 where N - number of detections.
    // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
    Mat out = net.forward();
-
-    Mat ref = (Mat_<float>(6, 5) << 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631,
-                                    0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168,
-                                    0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290,
-                                    0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
-                                    0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
-                                    0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
-    normAssert(out.reshape(1, out.total() / 7).rowRange(0, 6).colRange(2, 7), ref);
+    Mat ref = (Mat_<float>(6, 7) << 0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631,
+                                    0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168,
+                                    0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290,
+                                    0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
+                                    0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
+                                    0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
+    normAssertDetections(ref, out, "", 0.5, 1e-5, 2e-4);
 }
 INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
    Combine(
@ -426,14 +425,14 @@ TEST(Test_Caffe, FasterRCNN_and_RFCN)
                            "resnet50_rfcn_final.caffemodel"};
    std::string protos[] = {"faster_rcnn_vgg16.prototxt", "faster_rcnn_zf.prototxt",
                            "rfcn_pascal_voc_resnet50.prototxt"};
-    Mat refs[] = {(Mat_<float>(3, 6) << 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
-                                        7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
-                                        12, 0.993028, 133.221, 189.377, 350.994, 563.166),
-                  (Mat_<float>(3, 6) << 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
-                                        7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
-                                        12, 0.967198, 138.588, 206.843, 329.766, 553.176),
-                  (Mat_<float>(2, 6) << 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
-                                        12, 0.94786, 132.093, 223.903, 338.077, 566.16)};
+    Mat refs[] = {(Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
+                                        0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
+                                        0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166),
+                  (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
+                                        0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
+                                        0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176),
+                  (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
+                                        0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16)};
    for (int i = 0; i < 3; ++i)
    {
        std::string proto = findDataFile("dnn/" + protos[i], false);
@ -450,15 +449,7 @@ TEST(Test_Caffe, FasterRCNN_and_RFCN)
        // Output has shape 1x1xNx7 where N - number of detections.
        // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
        Mat out = net.forward();
-        out = out.reshape(1, out.total() / 7);
-
-        Mat detections;
-        for (int j = 0; j < out.rows; ++j)
-        {
-            if (out.at<float>(j, 2) > 0.8)
-              detections.push_back(out.row(j).colRange(1, 7));
-        }
-        normAssert(detections, refs[i], ("model name: " + models[i]).c_str(), 2e-4, 6e-4);
+        normAssertDetections(refs[i], out, ("model name: " + models[i]).c_str(), 0.8);
    }
 }

--- a/modules/dnn/test/test_common.hpp
+++ b/modules/dnn/test/test_common.hpp
@ -57,6 +57,96 @@ inline void normAssert(cv::InputArray ref, cv::InputArray test, const char *comm
    EXPECT_LE(normInf, lInf) << comment;
 }

+// Converts an Nx4 CV_32FC1 matrix whose rows are [left, top, right, bottom]
+// into N rectangles stored as (x, y, width, height).
+static std::vector<cv::Rect2d> matToBoxes(const cv::Mat& m)
+{
+    EXPECT_EQ(m.type(), CV_32FC1);
+    EXPECT_EQ(m.dims, 2);
+    EXPECT_EQ(m.cols, 4);
+
+    std::vector<cv::Rect2d> boxes;
+    boxes.reserve(m.rows);
+    for (int rowIdx = 0; rowIdx < m.rows; ++rowIdx)
+    {
+        CV_Assert(m.row(rowIdx).isContinuous());
+        const float* corners = m.ptr<float>(rowIdx);
+        const double left = corners[0];
+        const double top = corners[1];
+        const double right = corners[2];
+        const double bottom = corners[3];
+        boxes.push_back(cv::Rect2d(left, top, right - left, bottom - top));
+    }
+    return boxes;
+}
+
+// Compares two sets of detections given as parallel arrays (class id, score, box).
+//
+// Every test detection whose score is not below confThreshold must pair with a
+// distinct, not yet matched reference detection that has the same class id, a score
+// differing by less than scores_diff, and an IoU with the test box that differs
+// from 1 by less than boxes_iou_diff. Reference detections with a score above
+// confThreshold that remain unmatched are reported as failures as well.
+// `comment` is appended to every failure message.
+inline void normAssertDetections(const std::vector<int>& refClassIds,
+                                 const std::vector<float>& refScores,
+                                 const std::vector<cv::Rect2d>& refBoxes,
+                                 const std::vector<int>& testClassIds,
+                                 const std::vector<float>& testScores,
+                                 const std::vector<cv::Rect2d>& testBoxes,
+                                 const char *comment = "", double confThreshold = 0.0,
+                                 double scores_diff = 1e-5, double boxes_iou_diff = 1e-4)
+{
+    // The three arrays of each set are indexed in parallel below; stop early on a
+    // length mismatch instead of reading out of bounds.
+    ASSERT_EQ(refClassIds.size(), refBoxes.size()) << comment;
+    ASSERT_EQ(refScores.size(), refBoxes.size()) << comment;
+    ASSERT_EQ(testClassIds.size(), testBoxes.size()) << comment;
+    ASSERT_EQ(testScores.size(), testBoxes.size()) << comment;
+
+    std::vector<bool> matchedRefBoxes(refBoxes.size(), false);
+    for (size_t i = 0; i < testBoxes.size(); ++i)
+    {
+        double testScore = testScores[i];
+        if (testScore < confThreshold)
+            continue;
+
+        int testClassId = testClassIds[i];
+        const cv::Rect2d& testBox = testBoxes[i];
+        bool matched = false;
+        // Take the first free reference detection that agrees on class, score and box.
+        for (size_t j = 0; j < refBoxes.size() && !matched; ++j)
+        {
+            if (!matchedRefBoxes[j] && testClassId == refClassIds[j] &&
+                std::abs(testScore - refScores[j]) < scores_diff)
+            {
+                double interArea = (testBox & refBoxes[j]).area();
+                double iou = interArea / (testBox.area() + refBoxes[j].area() - interArea);
+                if (std::abs(iou - 1.0) < boxes_iou_diff)
+                {
+                    matched = true;
+                    matchedRefBoxes[j] = true;
+                }
+            }
+        }
+        if (!matched)
+            std::cout << cv::format("Unmatched prediction: class %d score %f box ",
+                                    testClassId, testScore) << testBox << std::endl;
+        EXPECT_TRUE(matched) << comment;
+    }
+
+    // Check unmatched reference detections.
+    for (size_t i = 0; i < refBoxes.size(); ++i)
+    {
+        if (!matchedRefBoxes[i] && refScores[i] > confThreshold)
+        {
+            std::cout << cv::format("Unmatched reference: class %d score %f box ",
+                                    refClassIds[i], refScores[i]) << refBoxes[i] << std::endl;
+            // Cannot hold inside this branch: used to record the failure together
+            // with the offending score.
+            EXPECT_LE(refScores[i], confThreshold) << comment;
+        }
+    }
+}
+
+// For SSD-based object detection networks which produce output of shape 1x1xNx7
+// where N is the number of detections and every detection is represented by
+// a vector [batchId, classId, confidence, left, top, right, bottom].
+//
+// Splits both blobs into class ids, scores and boxes and forwards them to the
+// vector-based normAssertDetections overload. Column 0 (batchId) is not compared.
+// Both blobs must have a total element count divisible by 7.
+inline void normAssertDetections(cv::Mat ref, cv::Mat out, const char *comment = "",
+                                 double confThreshold = 0.0, double scores_diff = 1e-5,
+                                 double boxes_iou_diff = 1e-4)
+{
+    CV_Assert(ref.total() % 7 == 0);
+    CV_Assert(out.total() % 7 == 0);
+    // View both blobs as single-channel matrices with one detection (7 values) per row.
+    ref = ref.reshape(1, ref.total() / 7);
+    out = out.reshape(1, out.total() / 7);
+
+    // Column 1: class ids, converted from float to 32-bit integers.
+    cv::Mat refClassIds, testClassIds;
+    ref.col(1).convertTo(refClassIds, CV_32SC1);
+    out.col(1).convertTo(testClassIds, CV_32SC1);
+    // Column 2: confidences; columns 3..6: box corners (left, top, right, bottom).
+    std::vector<float> refScores(ref.col(2)), testScores(out.col(2));
+    std::vector<cv::Rect2d> refBoxes = matToBoxes(ref.colRange(3, 7));
+    std::vector<cv::Rect2d> testBoxes = matToBoxes(out.colRange(3, 7));
+    normAssertDetections(refClassIds, refScores, refBoxes, testClassIds, testScores,
+                         testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff);
+}
+
 inline bool readFileInMemory(const std::string& filename, std::string& content)
 {
    std::ios::openmode mode = std::ios::in | std::ios::binary;
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@ -42,9 +42,8 @@
 //M*/

 #include "test_precomp.hpp"
+#include "npy_blob.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
-#include <opencv2/core/ocl.hpp>
-#include <opencv2/ts/ocl_test.hpp>

 namespace opencv_test { namespace {

@ -66,238 +65,125 @@ TEST(Test_Darknet, read_yolo_voc)
    ASSERT_FALSE(net.empty());
 }

-OCL_TEST(Reproducibility_TinyYoloVoc, Accuracy)
+// Test object detection network from Darknet framework.
+static void testDarknetModel(const std::string& cfg, const std::string& weights,
+                             const std::vector<cv::String>& outNames,
+                             const std::vector<int>& refClassIds,
+                             const std::vector<float>& refConfidences,
+                             const std::vector<Rect2d>& refBoxes,
+                             int targetId, float confThreshold = 0.24)
 {
-    Net net;
-    {
-        const string cfg = findDataFile("dnn/tiny-yolo-voc.cfg", false);
-        const string model = findDataFile("dnn/tiny-yolo-voc.weights", false);
-        net = readNetFromDarknet(cfg, model);
-        ASSERT_FALSE(net.empty());
-    }
-
-    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
-    net.setPreferableTarget(DNN_TARGET_OPENCL);
-
-    // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format
    Mat sample = imread(_tf("dog416.png"));
-    ASSERT_TRUE(!sample.empty());
-
-    Size inputSize(416, 416);
-
-    if (sample.size() != inputSize)
-        resize(sample, sample, inputSize);
-
-    net.setInput(blobFromImage(sample, 1 / 255.F), "data");
-    Mat out = net.forward("detection_out");
-
-    Mat detection;
-    const float confidenceThreshold = 0.24;
-
-    for (int i = 0; i < out.rows; i++) {
-        const int probability_index = 5;
-        const int probability_size = out.cols - probability_index;
-        float *prob_array_ptr = &out.at<float>(i, probability_index);
-        size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
-        float confidence = out.at<float>(i, (int)objectClass + probability_index);
-
-        if (confidence > confidenceThreshold)
-            detection.push_back(out.row(i));
+    Mat inp = blobFromImage(sample, 1.0/255, Size(416, 416), Scalar(), true, false);
+
+    Net net = readNet(findDataFile("dnn/" + cfg, false),
+                      findDataFile("dnn/" + weights, false));
+    net.setPreferableTarget(targetId);
+    net.setInput(inp);
+    std::vector<Mat> outs;
+    net.forward(outs, outNames);
+
+    std::vector<int> classIds;
+    std::vector<float> confidences;
+    std::vector<Rect2d> boxes;
+    for (int i = 0; i < outs.size(); ++i)
+    {
+        Mat& out = outs[i];
+        for (int j = 0; j < out.rows; ++j)
+        {
+            Mat scores = out.row(j).colRange(5, out.cols);
+            double confidence;
+            Point maxLoc;
+            minMaxLoc(scores, 0, &confidence, 0, &maxLoc);
+
+            float* detection = out.ptr<float>(j);
+            double centerX = detection[0];
+            double centerY = detection[1];
+            double width = detection[2];
+            double height = detection[3];
+            boxes.push_back(Rect2d(centerX - 0.5 * width, centerY - 0.5 * height,
+                                   width, height));
+            confidences.push_back(confidence);
+            classIds.push_back(maxLoc.x);
+        }
    }
+    normAssertDetections(refClassIds, refConfidences, refBoxes, classIds,
+                         confidences, boxes, "", confThreshold, 8e-5, 3e-5);
+}

-    // obtained by: ./darknet detector test ./cfg/voc.data  ./cfg/tiny-yolo-voc.cfg ./tiny-yolo-voc.weights -thresh 0.24 ./dog416.png
-    // There are 2 objects (6-car, 11-dog) with 25 values for each:
-    // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] }
-    float ref_array[] = {
-        0.736762F, 0.239551F, 0.315440F, 0.160779F, 0.761977F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.761967F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.287486F, 0.653731F, 0.315579F, 0.534527F, 0.782737F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.780595F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F
-    };
-
-    const int number_of_objects = 2;
-    Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array);
+typedef testing::TestWithParam<DNNTarget> Test_Darknet_nets;

-    normAssert(ref, detection);
+TEST_P(Test_Darknet_nets, YoloVoc)
+{
+    int targetId = GetParam();
+    std::vector<cv::String> outNames(1, "detection_out");
+
+    std::vector<int> classIds(3);
+    std::vector<float> confidences(3);
+    std::vector<Rect2d> boxes(3);
+    classIds[0] = 6;  confidences[0] = 0.750469f; boxes[0] = Rect2d(0.577374, 0.127391, 0.325575, 0.173418);  // a car
+    classIds[1] = 1;  confidences[1] = 0.780879f; boxes[1] = Rect2d(0.270762, 0.264102, 0.461713, 0.48131); // a bicycle
+    classIds[2] = 11; confidences[2] = 0.901615f; boxes[2] = Rect2d(0.1386, 0.338509, 0.282737, 0.60028);  // a dog
+    testDarknetModel("yolo-voc.cfg", "yolo-voc.weights", outNames,
+                     classIds, confidences, boxes, targetId);
 }

-TEST(Reproducibility_TinyYoloVoc, Accuracy)
+TEST_P(Test_Darknet_nets, TinyYoloVoc)
 {
-    Net net;
-    {
-        const string cfg = findDataFile("dnn/tiny-yolo-voc.cfg", false);
-        const string model = findDataFile("dnn/tiny-yolo-voc.weights", false);
-        net = readNetFromDarknet(cfg, model);
-        ASSERT_FALSE(net.empty());
-    }
-
-    // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format
-    Mat sample = imread(_tf("dog416.png"));
-    ASSERT_TRUE(!sample.empty());
-
-    Size inputSize(416, 416);
-
-    if (sample.size() != inputSize)
-        resize(sample, sample, inputSize);
-
-    net.setInput(blobFromImage(sample, 1 / 255.F), "data");
-    Mat out = net.forward("detection_out");
-
-    Mat detection;
-    const float confidenceThreshold = 0.24;
-
-    for (int i = 0; i < out.rows; i++) {
-        const int probability_index = 5;
-        const int probability_size = out.cols - probability_index;
-        float *prob_array_ptr = &out.at<float>(i, probability_index);
-        size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
-        float confidence = out.at<float>(i, (int)objectClass + probability_index);
-
-        if (confidence > confidenceThreshold)
-            detection.push_back(out.row(i));
-    }
-
-    // obtained by: ./darknet detector test ./cfg/voc.data  ./cfg/tiny-yolo-voc.cfg ./tiny-yolo-voc.weights -thresh 0.24 ./dog416.png
-    // There are 2 objects (6-car, 11-dog) with 25 values for each:
-    // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] }
-    float ref_array[] = {
-        0.736762F, 0.239551F, 0.315440F, 0.160779F, 0.761977F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.761967F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.287486F, 0.653731F, 0.315579F, 0.534527F, 0.782737F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.780595F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F
-    };
-
-    const int number_of_objects = 2;
-    Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array);
-
-    normAssert(ref, detection);
+    int targetId = GetParam();
+    std::vector<cv::String> outNames(1, "detection_out");
+    std::vector<int> classIds(2);
+    std::vector<float> confidences(2);
+    std::vector<Rect2d> boxes(2);
+    classIds[0] = 6;  confidences[0] = 0.761967f; boxes[0] = Rect2d(0.579042, 0.159161, 0.31544, 0.160779);  // a car
+    classIds[1] = 11; confidences[1] = 0.780595f; boxes[1] = Rect2d(0.129696, 0.386467, 0.315579, 0.534527);  // a dog
+    testDarknetModel("tiny-yolo-voc.cfg", "tiny-yolo-voc.weights", outNames,
+                     classIds, confidences, boxes, targetId);
 }

-OCL_TEST(Reproducibility_YoloVoc, Accuracy)
+TEST_P(Test_Darknet_nets, YOLOv3)
 {
-    Net net;
-    {
-        const string cfg = findDataFile("dnn/yolo-voc.cfg", false);
-        const string model = findDataFile("dnn/yolo-voc.weights", false);
-        net = readNetFromDarknet(cfg, model);
-        ASSERT_FALSE(net.empty());
-    }
-
-    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
-    net.setPreferableTarget(DNN_TARGET_OPENCL);
-
-    // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format
-    Mat sample = imread(_tf("dog416.png"));
-    ASSERT_TRUE(!sample.empty());
-
-    Size inputSize(416, 416);
-
-    if (sample.size() != inputSize)
-        resize(sample, sample, inputSize);
-
-    net.setInput(blobFromImage(sample, 1 / 255.F), "data");
-    Mat out = net.forward("detection_out");
-
-    Mat detection;
-    const float confidenceThreshold = 0.24;
-
-    for (int i = 0; i < out.rows; i++) {
-        const int probability_index = 5;
-        const int probability_size = out.cols - probability_index;
-        float *prob_array_ptr = &out.at<float>(i, probability_index);
-        size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
-        float confidence = out.at<float>(i, (int)objectClass + probability_index);
-
-        if (confidence > confidenceThreshold)
-            detection.push_back(out.row(i));
-    }
-
-    // obtained by: ./darknet detector test ./cfg/voc.data  ./cfg/yolo-voc.cfg ./yolo-voc.weights -thresh 0.24 ./dog416.png
-    // There are 3 objects (6-car, 1-bicycle, 11-dog) with 25 values for each:
-    // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] }
-    float ref_array[] = {
-        0.740161F, 0.214100F, 0.325575F, 0.173418F, 0.750769F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.750469F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.501618F, 0.504757F, 0.461713F, 0.481310F, 0.783550F, 0.000000F, 0.780879F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.279968F, 0.638651F, 0.282737F, 0.600284F, 0.901864F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.901615F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F
-    };
+    int targetId = GetParam();
+    std::vector<cv::String> outNames(3);
+    outNames[0] = "yolo_82";
+    outNames[1] = "yolo_94";
+    outNames[2] = "yolo_106";
+
+    std::vector<int> classIds(3);
+    std::vector<float> confidences(3);
+    std::vector<Rect2d> boxes(3);
+    classIds[0] = 7;  confidences[0] = 0.952983f; boxes[0] = Rect2d(0.614622, 0.150257, 0.286747, 0.138994);  // a truck
+    classIds[1] = 1; confidences[1] = 0.987908f; boxes[1] = Rect2d(0.150913, 0.221933, 0.591342, 0.524327);  // a bicycle
+    classIds[2] = 16; confidences[2] = 0.998836f; boxes[2] = Rect2d(0.160024, 0.389964, 0.257861, 0.553752);  // a dog (COCO)
+    testDarknetModel("yolov3.cfg", "yolov3.weights", outNames,
+                     classIds, confidences, boxes, targetId);
+}

-    const int number_of_objects = 3;
-    Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array);
+INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets, availableDnnTargets());

-    normAssert(ref, detection);
+static void testDarknetLayer(const std::string& name, bool hasWeights = false)
+{
+    std::string cfg = findDataFile("dnn/darknet/" + name + ".cfg", false);
+    std::string model = "";
+    if (hasWeights)
+        model = findDataFile("dnn/darknet/" + name + ".weights", false);
+    Mat inp = blobFromNPY(findDataFile("dnn/darknet/" + name + "_in.npy", false));
+    Mat ref = blobFromNPY(findDataFile("dnn/darknet/" + name + "_out.npy", false));
+
+    Net net = readNet(cfg, model);
+    net.setInput(inp);
+    Mat out = net.forward();
+    normAssert(out, ref);
 }

-TEST(Reproducibility_YoloVoc, Accuracy)
+TEST(Test_Darknet, shortcut)
 {
-    Net net;
-    {
-        const string cfg = findDataFile("dnn/yolo-voc.cfg", false);
-        const string model = findDataFile("dnn/yolo-voc.weights", false);
-        net = readNetFromDarknet(cfg, model);
-        ASSERT_FALSE(net.empty());
-    }
-
-    // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format
-    Mat sample = imread(_tf("dog416.png"));
-    ASSERT_TRUE(!sample.empty());
-
-    Size inputSize(416, 416);
-
-    if (sample.size() != inputSize)
-        resize(sample, sample, inputSize);
-
-    net.setInput(blobFromImage(sample, 1 / 255.F), "data");
-    Mat out = net.forward("detection_out");
-
-    Mat detection;
-    const float confidenceThreshold = 0.24;
-
-    for (int i = 0; i < out.rows; i++) {
-        const int probability_index = 5;
-        const int probability_size = out.cols - probability_index;
-        float *prob_array_ptr = &out.at<float>(i, probability_index);
-        size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
-        float confidence = out.at<float>(i, (int)objectClass + probability_index);
-
-        if (confidence > confidenceThreshold)
-            detection.push_back(out.row(i));
-    }
-
-    // obtained by: ./darknet detector test ./cfg/voc.data  ./cfg/yolo-voc.cfg ./yolo-voc.weights -thresh 0.24 ./dog416.png
-    // There are 3 objects (6-car, 1-bicycle, 11-dog) with 25 values for each:
-    // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] }
-    float ref_array[] = {
-        0.740161F, 0.214100F, 0.325575F, 0.173418F, 0.750769F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.750469F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.501618F, 0.504757F, 0.461713F, 0.481310F, 0.783550F, 0.000000F, 0.780879F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-
-        0.279968F, 0.638651F, 0.282737F, 0.600284F, 0.901864F, 0.000000F, 0.000000F, 0.000000F, 0.000000F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.901615F,
-        0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F
-    };
-
-    const int number_of_objects = 3;
-    Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array);
+    testDarknetLayer("shortcut");
+}

-    normAssert(ref, detection);
+TEST(Test_Darknet, upsample)
+{
+    testDarknetLayer("upsample");
 }

 }} // namespace
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -173,6 +173,8 @@ TEST_P(Test_TensorFlow_layers, deconvolution)
    runTensorFlowNet("deconvolution_stride_2_same", targetId);
    runTensorFlowNet("deconvolution_adj_pad_valid", targetId);
    runTensorFlowNet("deconvolution_adj_pad_same", targetId);
+    runTensorFlowNet("keras_deconv_valid", targetId);
+    runTensorFlowNet("keras_deconv_same", targetId);
 }

 TEST_P(Test_TensorFlow_layers, matmul)
@ -237,7 +239,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)

    normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
    normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
-    normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
+    normAssertDetections(target[2], output[2], "", 0.2);
 }

 TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
@ -255,21 +257,12 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
    // Output has shape 1x1xNx7 where N - number of detections.
    // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
    Mat out = net.forward();
-    out = out.reshape(1, out.total() / 7);
-
-    Mat detections;
-    for (int i = 0; i < out.rows; ++i)
-    {
-        if (out.at<float>(i, 2) > 0.5)
-          detections.push_back(out.row(i).colRange(1, 7));
-    }
-
-    Mat ref = (Mat_<float>(5, 6) << 1, 0.90176028, 0.19872092, 0.36311883, 0.26461923, 0.63498729,
-                                    3, 0.93569964, 0.64865261, 0.45906419, 0.80675775, 0.65708131,
-                                    3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
-                                    10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
-                                    10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
-    normAssert(detections, ref);
+    Mat ref = (Mat_<float>(5, 7) << 0, 1, 0.90176028, 0.19872092, 0.36311883, 0.26461923, 0.63498729,
+                                    0, 3, 0.93569964, 0.64865261, 0.45906419, 0.80675775, 0.65708131,
+                                    0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
+                                    0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
+                                    0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
+    normAssertDetections(ref, out, "", 0.5);
 }

 TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
@ -289,13 +282,13 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
    Mat out = net.forward();

    // References are from test for Caffe model.
-    Mat ref = (Mat_<float>(6, 5) << 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631,
-                                    0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168,
-                                    0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290,
-                                    0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
-                                    0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
-                                    0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
-    normAssert(out.reshape(1, out.total() / 7).rowRange(0, 6).colRange(2, 7), ref, "", 2.8e-4, 3.4e-3);
+    Mat ref = (Mat_<float>(6, 7) << 0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631,
+                                    0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168,
+                                    0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290,
+                                    0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
+                                    0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
+                                    0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
+    normAssertDetections(ref, out, "", 0.9, 3.4e-3, 1e-2);
 }

 INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());
--- a/modules/imgproc/doc/pics/polar_remap_doc.png
+++ b/modules/imgproc/doc/pics/polar_remap_doc.png
--- a/modules/imgproc/doc/pics/polar_remap_doc.svg
+++ b/modules/imgproc/doc/pics/polar_remap_doc.svg
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -295,6 +295,15 @@ enum InterpolationFlags{
    WARP_INVERSE_MAP     = 16
 };

+/** \brief Specify the polar mapping mode
+@sa warpPolar
+*/
+enum WarpPolarMode
+{
+    WARP_POLAR_LINEAR = 0, ///< Remaps an image to/from polar space.
+    WARP_POLAR_LOG = 256   ///< Remaps an image to/from semilog-polar space.
+};
+
 enum InterpolationMasks {
       INTER_BITS      = 5,
       INTER_BITS2     = INTER_BITS * 2,
@ -377,7 +386,9 @@ enum GrabCutModes {
    automatically initialized with GC_BGD .*/
    GC_INIT_WITH_MASK  = 1,
    /** The value means that the algorithm should just resume. */
-    GC_EVAL            = 2
+    GC_EVAL            = 2,
+    /** The value means that the algorithm should just run the grabCut algorithm (a single iteration) with the fixed model */
+    GC_EVAL_FREEZE_MODEL = 3
 };

 //! distanceTransform algorithm flags
@ -2546,7 +2557,10 @@ An example using the cv::linearPolar and cv::logPolar operations

 /** @brief Remaps an image to semilog-polar coordinates space.

-Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image"):
+@deprecated This function produces the same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags+WARP_POLAR_LOG);
+
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image d)"):
 \f[\begin{array}{l}
  dst( \rho , \phi ) = src(x,y) \\
  dst.size() \leftarrow src.size()
@ -2556,13 +2570,13 @@ where
 \f[\begin{array}{l}
  I = (dx,dy) = (x - center.x,y - center.y) \\
  \rho = M \cdot log_e(\texttt{magnitude} (I)) ,\\
-  \phi = Ky \cdot \texttt{angle} (I)_{0..360 deg} \\
+  \phi = Kangle \cdot \texttt{angle} (I) \\
 \end{array}\f]

 and
 \f[\begin{array}{l}
  M = src.cols / log_e(maxRadius) \\
-  Ky = src.rows / 360 \\
+  Kangle = src.rows / 2\pi \\
 \end{array}\f]

 The function emulates the human "foveal" vision and can be used for fast scale and
@ -2576,16 +2590,19 @@ rotation-invariant template matching, for object tracking and so forth.
@note
 -   The function can not operate in-place.
 -   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+
+@sa cv::linearPolar
+@endinternal
 */
 CV_EXPORTS_W void logPolar( InputArray src, OutputArray dst,
                            Point2f center, double M, int flags );

 /** @brief Remaps an image to polar coordinates space.

-@anchor polar_remaps_reference_image
-![Polar remaps reference](pics/polar_remap_doc.png)
+@deprecated This function produces the same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags)

-Transform the source image using the following transformation:
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image c)"):
 \f[\begin{array}{l}
  dst( \rho , \phi ) = src(x,y) \\
  dst.size() \leftarrow src.size()
@ -2594,14 +2611,14 @@ Transform the source image using the following transformation:
 where
 \f[\begin{array}{l}
  I = (dx,dy) = (x - center.x,y - center.y) \\
-  \rho = Kx \cdot \texttt{magnitude} (I) ,\\
-  \phi = Ky \cdot \texttt{angle} (I)_{0..360 deg}
+  \rho = Kx \cdot \texttt{magnitude} (I) ,\\
+  \phi = Ky \cdot \texttt{angle} (I)
 \end{array}\f]

 and
 \f[\begin{array}{l}
  Kx = src.cols / maxRadius \\
-  Ky = src.rows / 360
+  Ky = src.rows / 2\pi
 \end{array}\f]


@ -2615,10 +2632,104 @@ and
 -   The function can not operate in-place.
 -   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.

+@sa cv::logPolar
+@endinternal
 */
 CV_EXPORTS_W void linearPolar( InputArray src, OutputArray dst,
                               Point2f center, double maxRadius, int flags );

+
+/** \brief Remaps an image to polar or semilog-polar coordinates space
+
+@anchor polar_remaps_reference_image
+![Polar remaps reference](pics/polar_remap_doc.png)
+
+Transform the source image using the following transformation:
+\f[
+dst(\rho , \phi ) = src(x,y)
+\f]
+
+where
+\f[
+\begin{array}{l}
+\vec{I} = (x - center.x, \;y - center.y) \\
+\phi = Kangle \cdot \texttt{angle} (\vec{I}) \\
+\rho = \left\{\begin{matrix}
+Klin \cdot \texttt{magnitude} (\vec{I}) & default \\
+Klog \cdot log_e(\texttt{magnitude} (\vec{I})) & if \; semilog \\
+\end{matrix}\right.
+\end{array}
+\f]
+
+and
+\f[
+\begin{array}{l}
+Kangle = dsize.height / 2\pi \\
+Klin = dsize.width / maxRadius \\
+Klog = dsize.width / log_e(maxRadius) \\
+\end{array}
+\f]
+
+
+\par Linear vs semilog mapping
+
+Polar mapping can be linear or semi-log. Add one of #WarpPolarMode to `flags` to specify the polar mapping mode.
+
+Linear is the default mode.
+
+The semilog mapping emulates the human "foveal" vision, which permits very high acuity along the line of sight (central vision)
+in contrast to peripheral vision, where acuity is lower.
+
+\par Option on `dsize`:
+
+- if both values in `dsize <=0 ` (default),
+the destination image will have (almost) the same area as the source bounding circle:
+\f[\begin{array}{l}
+dsize.area  \leftarrow (maxRadius^2 \cdot \pi) \\
+dsize.width = \texttt{cvRound}(maxRadius) \\
+dsize.height = \texttt{cvRound}(maxRadius \cdot \pi) \\
+\end{array}\f]
+
+
+- if only `dsize.height <= 0`,
+the destination image area will be proportional to the bounding circle area but scaled by `Kx * Kx`:
+\f[\begin{array}{l}
+dsize.height = \texttt{cvRound}(dsize.width \cdot \pi) \\
+\end{array}
+\f]
+
+- if both values in `dsize > 0 `,
+the destination image will have the given size therefore the area of the bounding circle will be scaled to `dsize`.
+
+
+\par Reverse mapping
+
+You can get the reverse mapping by adding #WARP_INVERSE_MAP to `flags`
+\snippet polar_transforms.cpp InverseMap
+
+In addition, to calculate the original coordinate from a polar mapped coordinate \f$(\rho, \phi) \rightarrow (x, y)\f$:
+\snippet polar_transforms.cpp InverseCoordinate
+
+@param src Source image.
+@param dst Destination image. It will have same type as src.
+@param dsize The destination image size (see description for valid options).
+@param center The transformation center.
+@param maxRadius The radius of the bounding circle to transform. It determines the inverse magnitude scale parameter too.
+@param flags A combination of interpolation methods, #InterpolationFlags + #WarpPolarMode.
+            - Add #WARP_POLAR_LINEAR to select linear polar mapping (default)
+            - Add #WARP_POLAR_LOG to select semilog polar mapping
+            - Add #WARP_INVERSE_MAP for reverse mapping.
+@note
+-  The function can not operate in-place.
+-  To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+-  This function uses #remap. Due to current implementation limitations, the size of the input and output images should be less than 32767x32767.
+
+@sa cv::remap
+*/
+CV_EXPORTS_W void warpPolar(InputArray src, OutputArray dst, Size dsize,
+                            Point2f center, double maxRadius, int flags);
+
+
 //! @} imgproc_transform

 //! @addtogroup imgproc_misc
--- a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
@ -260,14 +260,14 @@ CVAPI(void)  cvConvertMaps( const CvArr* mapx, const CvArr* mapy,
                            CvArr* mapxy, CvArr* mapalpha );

 /** @brief Performs forward or inverse log-polar image transform
-@see cv::logPolar
+@see cv::warpPolar
 */
 CVAPI(void)  cvLogPolar( const CvArr* src, CvArr* dst,
                         CvPoint2D32f center, double M,
                         int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));

 /** Performs forward or inverse linear-polar image transform
-@see cv::linearPolar
+@see cv::warpPolar
 */
 CVAPI(void)  cvLinearPolar( const CvArr* src, CvArr* dst,
                         CvPoint2D32f center, double maxRadius,
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@ -204,7 +204,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
    const RemapParams params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params), interpolation = get<2>(params), borderMode = BORDER_CONSTANT;
-    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+    //const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;

    checkDeviceMaxMemoryAllocSize(srcSize, type);

--- a/modules/imgproc/src/color_hsv.cpp
+++ b/modules/imgproc/src/color_hsv.cpp
@ -523,62 +523,38 @@ struct RGB2HLS_f

    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hscale(_hrange/360.f) {
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
        #endif
    }

-    #if CV_SSE2
-    void process(__m128& v_b0, __m128& v_b1, __m128& v_g0,
-                 __m128& v_g1, __m128& v_r0, __m128& v_r1) const
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_r, v_float32x4& v_g,
+                        v_float32x4& v_b, v_float32x4& v_hscale) const
    {
-        __m128 v_max0 = _mm_max_ps(_mm_max_ps(v_b0, v_g0), v_r0);
-        __m128 v_max1 = _mm_max_ps(_mm_max_ps(v_b1, v_g1), v_r1);
-        __m128 v_min0 = _mm_min_ps(_mm_min_ps(v_b0, v_g0), v_r0);
-        __m128 v_min1 = _mm_min_ps(_mm_min_ps(v_b1, v_g1), v_r1);
-        __m128 v_diff0 = _mm_sub_ps(v_max0, v_min0);
-        __m128 v_diff1 = _mm_sub_ps(v_max1, v_min1);
-        __m128 v_sum0 = _mm_add_ps(v_max0, v_min0);
-        __m128 v_sum1 = _mm_add_ps(v_max1, v_min1);
-        __m128 v_l0 = _mm_mul_ps(v_sum0, _mm_set1_ps(0.5f));
-        __m128 v_l1 = _mm_mul_ps(v_sum1, _mm_set1_ps(0.5f));
-
-        __m128 v_gel0 = _mm_cmpge_ps(v_l0, _mm_set1_ps(0.5f));
-        __m128 v_gel1 = _mm_cmpge_ps(v_l1, _mm_set1_ps(0.5f));
-        __m128 v_s0 = _mm_and_ps(v_gel0, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum0));
-        __m128 v_s1 = _mm_and_ps(v_gel1, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum1));
-        v_s0 = _mm_or_ps(v_s0, _mm_andnot_ps(v_gel0, v_sum0));
-        v_s1 = _mm_or_ps(v_s1, _mm_andnot_ps(v_gel1, v_sum1));
-        v_s0 = _mm_div_ps(v_diff0, v_s0);
-        v_s1 = _mm_div_ps(v_diff1, v_s1);
-
-        __m128 v_gteps0 = _mm_cmpgt_ps(v_diff0, _mm_set1_ps(FLT_EPSILON));
-        __m128 v_gteps1 = _mm_cmpgt_ps(v_diff1, _mm_set1_ps(FLT_EPSILON));
-
-        v_diff0 = _mm_div_ps(_mm_set1_ps(60.f), v_diff0);
-        v_diff1 = _mm_div_ps(_mm_set1_ps(60.f), v_diff1);
-
-        __m128 v_eqr0 = _mm_cmpeq_ps(v_max0, v_r0);
-        __m128 v_eqr1 = _mm_cmpeq_ps(v_max1, v_r1);
-        __m128 v_h0 = _mm_and_ps(v_eqr0, _mm_mul_ps(_mm_sub_ps(v_g0, v_b0), v_diff0));
-        __m128 v_h1 = _mm_and_ps(v_eqr1, _mm_mul_ps(_mm_sub_ps(v_g1, v_b1), v_diff1));
-        __m128 v_eqg0 = _mm_cmpeq_ps(v_max0, v_g0);
-        __m128 v_eqg1 = _mm_cmpeq_ps(v_max1, v_g1);
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(_mm_andnot_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b0, v_r0), v_diff0), _mm_set1_ps(120.f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(_mm_andnot_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b1, v_r1), v_diff1), _mm_set1_ps(120.f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_andnot_ps(_mm_or_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r0, v_g0), v_diff0), _mm_set1_ps(240.f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_andnot_ps(_mm_or_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r1, v_g1), v_diff1), _mm_set1_ps(240.f))));
-        v_h0 = _mm_add_ps(v_h0, _mm_and_ps(_mm_cmplt_ps(v_h0, _mm_setzero_ps()), _mm_set1_ps(360.f)));
-        v_h1 = _mm_add_ps(v_h1, _mm_and_ps(_mm_cmplt_ps(v_h1, _mm_setzero_ps()), _mm_set1_ps(360.f)));
-        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
-        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
-
-        v_b0 = _mm_and_ps(v_gteps0, v_h0);
-        v_b1 = _mm_and_ps(v_gteps1, v_h1);
-        v_g0 = v_l0;
-        v_g1 = v_l1;
-        v_r0 = _mm_and_ps(v_gteps0, v_s0);
-        v_r1 = _mm_and_ps(v_gteps1, v_s1);
+        v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
+        v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
+
+        v_float32x4 v_diff = v_max_rgb - v_min_rgb;
+        v_float32x4 v_sum = v_max_rgb + v_min_rgb;
+        v_float32x4 v_half = v_setall_f32(0.5f);
+        v_float32x4 v_l = v_sum * v_half;
+
+        v_float32x4 v_s = v_diff / v_select(v_l < v_half, v_sum, v_setall_f32(2.0f) - v_sum);
+
+        v_float32x4 v_r_eq_max = v_max_rgb == v_r;
+        v_float32x4 v_g_eq_max = v_max_rgb == v_g;
+        v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
+                          v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
+        v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
+                            v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
+        v_float32x4 v_rev_diff = v_setall_f32(60.0f) / v_diff;
+        v_h = v_muladd(v_h, v_rev_diff, v_res) * v_hscale;
+
+        v_float32x4 v_diff_gt_eps = v_diff > v_setall_f32(FLT_EPSILON);
+        v_r = v_diff_gt_eps & v_h;
+        v_g = v_l;
+        v_b = v_diff_gt_eps & v_s;
    }
    #endif

@ -587,49 +563,56 @@ struct RGB2HLS_f
        int i = 0, bidx = blueIdx, scn = srccn;
        n *= 3;

-        #if CV_SSE2
-        if (haveSIMD)
+        #if CV_SIMD128
+        if (hasSIMD)
        {
-            for( ; i <= n - 24; i += 24, src += scn * 8 )
-            {
-                __m128 v_b0 = _mm_loadu_ps(src +  0);
-                __m128 v_b1 = _mm_loadu_ps(src +  4);
-                __m128 v_g0 = _mm_loadu_ps(src +  8);
-                __m128 v_g1 = _mm_loadu_ps(src + 12);
-                __m128 v_r0 = _mm_loadu_ps(src + 16);
-                __m128 v_r1 = _mm_loadu_ps(src + 20);
-
-                if (scn == 3)
-                {
-                    _mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
-                }
-                else
-                {
-                    __m128 v_a0 = _mm_loadu_ps(src + 24);
-                    __m128 v_a1 = _mm_loadu_ps(src + 28);
-                    _mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1, v_a0, v_a1);
+            v_float32x4 v_hscale = v_setall_f32(hscale);
+            if (scn == 3) {
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_r, v_g, v_b, v_hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_b, v_g, v_r, v_hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
                }
-
-                if (bidx)
-                {
-                    __m128 v_tmp0 = v_b0;
-                    __m128 v_tmp1 = v_b1;
-                    v_b0 = v_r0;
-                    v_b1 = v_r1;
-                    v_r0 = v_tmp0;
-                    v_r1 = v_tmp1;
+            } else { // scn == 4
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_r, v_g, v_b, v_hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_b, v_g, v_r, v_hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
                }
-
-                process(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
-
-                _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
-
-                _mm_storeu_ps(dst + i +  0, v_b0);
-                _mm_storeu_ps(dst + i +  4, v_b1);
-                _mm_storeu_ps(dst + i +  8, v_g0);
-                _mm_storeu_ps(dst + i + 12, v_g1);
-                _mm_storeu_ps(dst + i + 16, v_r0);
-                _mm_storeu_ps(dst + i + 20, v_r1);
            }
        }
        #endif
@ -672,8 +655,8 @@ struct RGB2HLS_f

    int srccn, blueIdx;
    float hscale;
-    #if CV_SSE2
-    bool haveSIMD;
+    #if CV_SIMD128
+    bool hasSIMD;
    #endif
 };

--- a/modules/imgproc/src/grabcut.cpp
+++ b/modules/imgproc/src/grabcut.cpp
@ -557,7 +557,10 @@ void cv::grabCut( InputArray _img, InputOutputArray _mask, Rect rect,
    if( iterCount <= 0)
        return;

-    if( mode == GC_EVAL )
+    if( mode == GC_EVAL_FREEZE_MODEL )
+        iterCount = 1;
+
+    if( mode == GC_EVAL || mode == GC_EVAL_FREEZE_MODEL )
        checkMask( img, mask );

    const double gamma = 50;
@ -571,7 +574,8 @@ void cv::grabCut( InputArray _img, InputOutputArray _mask, Rect rect,
    {
        GCGraph<double> graph;
        assignGMMsComponents( img, mask, bgdGMM, fgdGMM, compIdxs );
-        learnGMMs( img, mask, compIdxs, bgdGMM, fgdGMM );
+        if( mode != GC_EVAL_FREEZE_MODEL )
+            learnGMMs( img, mask, compIdxs, bgdGMM, fgdGMM );
        constructGCGraph(img, mask, bgdGMM, fgdGMM, lambda, leftW, upleftW, upW, uprightW, graph );
        estimateSegmentation( graph, mask );
    }
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1377,6 +1377,10 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
    return k.run(2, globalThreads, NULL, false);
 }

+#if 0
+/**
+@deprecated Only used by the old implementation of cv::linearPolar
+*/
 static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
    Point2f center, double maxRadius, int flags)
 {
@ -1517,6 +1521,8 @@ static bool ocl_logPolar(InputArray _src, OutputArray _dst,
 }
 #endif

+#endif
+
 #ifdef HAVE_OPENVX
 static bool openvx_remap(Mat src, Mat dst, Mat map1, Mat map2, int interpolation, const Scalar& borderValue)
 {
@ -3252,191 +3258,86 @@ cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dsta
    cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
 }

-/****************************************************************************************\
-*                                   Log-Polar Transform                                  *
-\****************************************************************************************/
-
-/* now it is done via Remap; more correct implementation should use
-   some super-sampling technique outside of the "fovea" circle */
-CV_IMPL void
-cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
-            CvPoint2D32f center, double M, int flags )
+/****************************************************************************************
+PkLab.net 2018 based on cv::linearPolar from OpenCV by J.L. Blanco, Apr 2009
+****************************************************************************************/
+void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
+                   Point2f center, double maxRadius, int flags)
 {
-    Mat src_with_border; // don't scope this variable (it holds image data)
-
-    cv::Ptr<CvMat> mapx, mapy;
-
-    CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
-    CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
-    CvSize dsize;
-
-    if( !CV_ARE_TYPES_EQ( src, dst ))
-        CV_Error( CV_StsUnmatchedFormats, "" );
-
-    if( M <= 0 )
-        CV_Error( CV_StsOutOfRange, "M should be >0" );
-
-    dsize = cvGetMatSize(dst);
-
-    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
-    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
-
-    if( !(flags & CV_WARP_INVERSE_MAP) )
+    // If no destination size is given, derive it proportionally from maxRadius:
+    // the number of angles (rows) is chosen to keep the same area as the bounding circle.
+    if ((dsize.width <= 0) && (dsize.height <= 0))
    {
-        int phi, rho;
-        cv::AutoBuffer<double> _exp_tab(dsize.width);
-        double* exp_tab = _exp_tab;
-
-        for( rho = 0; rho < dst->width; rho++ )
-            exp_tab[rho] = std::exp(rho/M) - 1.0;
-
-        for( phi = 0; phi < dsize.height; phi++ )
-        {
-            double cp = cos(phi*2*CV_PI/dsize.height);
-            double sp = sin(phi*2*CV_PI/dsize.height);
-            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
-            float* my = (float*)(mapy->data.ptr + phi*mapy->step);
-
-            for( rho = 0; rho < dsize.width; rho++ )
-            {
-                double r = exp_tab[rho];
-                double x = r*cp + center.x;
-                double y = r*sp + center.y;
-
-                mx[rho] = (float)x;
-                my[rho] = (float)y;
-            }
-        }
+        dsize.width = cvRound(maxRadius);
+        dsize.height = cvRound(maxRadius * CV_PI);
    }
-    else
+    else if (dsize.height <= 0)
    {
-        const int ANGLE_BORDER = 1;
-        Mat src_ = cv::cvarrToMat(src);
-        cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
-        srcstub = src_with_border; src = &srcstub;
-        CvSize ssize = cvGetMatSize(src);
-        ssize.height -= 2*ANGLE_BORDER;
-
-        int x, y;
-        CvMat bufx, bufy, bufp, bufa;
-        double ascale = ssize.height/(2*CV_PI);
-        cv::AutoBuffer<float> _buf(4*dsize.width);
-        float* buf = _buf;
-
-        bufx = cvMat( 1, dsize.width, CV_32F, buf );
-        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
-        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
-        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
-
-        for( x = 0; x < dsize.width; x++ )
-            bufx.data.fl[x] = (float)x - center.x;
-
-        for( y = 0; y < dsize.height; y++ )
-        {
-            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
-            float* my = (float*)(mapy->data.ptr + y*mapy->step);
-
-            for( x = 0; x < dsize.width; x++ )
-                bufy.data.fl[x] = (float)y - center.y;
-
-#if 1
-            cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
-
-            for( x = 0; x < dsize.width; x++ )
-                bufp.data.fl[x] += 1.f;
-
-            cvLog( &bufp, &bufp );
-
-            for( x = 0; x < dsize.width; x++ )
-            {
-                double rho = bufp.data.fl[x]*M;
-                double phi = bufa.data.fl[x]*ascale;
-
-                mx[x] = (float)rho;
-                my[x] = (float)phi + ANGLE_BORDER;
-            }
-#else
-            for( x = 0; x < dsize.width; x++ )
-            {
-                double xx = bufx.data.fl[x];
-                double yy = bufy.data.fl[x];
-
-                double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
-                double a = atan2(yy,xx);
-                if( a < 0 )
-                    a = 2*CV_PI + a;
-                a *= ascale;
-
-                mx[x] = (float)p;
-                my[x] = (float)a;
-            }
-#endif
-        }
+        dsize.height = cvRound(dsize.width * CV_PI);
    }

-    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
-}
-
-void cv::logPolar( InputArray _src, OutputArray _dst,
-                   Point2f center, double M, int flags )
-{
-    CV_INSTRUMENT_REGION()
-
-    CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
-        ocl_logPolar(_src, _dst, center, M, flags));
-    Mat src_with_border; // don't scope this variable (it holds image data)
-
    Mat mapx, mapy;
-
-    Mat srcstub, src = _src.getMat();
-    _dst.create(src.size(), src.type());
-    Size dsize = src.size();
-
-    if (M <= 0)
-        CV_Error(CV_StsOutOfRange, "M should be >0");
-
-
    mapx.create(dsize, CV_32F);
    mapy.create(dsize, CV_32F);
+    bool semiLog = (flags & WARP_POLAR_LOG) != 0;

    if (!(flags & CV_WARP_INVERSE_MAP))
    {
+        double Kangle = CV_2PI / dsize.height;
        int phi, rho;
-        cv::AutoBuffer<double> _exp_tab(dsize.width);
-        double* exp_tab = _exp_tab;

-        for (rho = 0; rho < dsize.width; rho++)
-            exp_tab[rho] = std::exp(rho / M) - 1.0;
+        // precalculate scaled rho
+        Mat rhos = Mat(1, dsize.width, CV_32F);
+        float* bufRhos = (float*)(rhos.data);
+        if (semiLog)
+        {
+            double Kmag = std::log(maxRadius) / dsize.width;
+            for (rho = 0; rho < dsize.width; rho++)
+                bufRhos[rho] = (float)(std::exp(rho * Kmag) - 1.0);
+
+        }
+        else
+        {
+            double Kmag = maxRadius / dsize.width;
+            for (rho = 0; rho < dsize.width; rho++)
+                bufRhos[rho] = (float)(rho * Kmag);
+        }

        for (phi = 0; phi < dsize.height; phi++)
        {
-            double cp = std::cos(phi * 2 * CV_PI / dsize.height);
-            double sp = std::sin(phi * 2 * CV_PI / dsize.height);
+            double KKy = Kangle * phi;
+            double cp = std::cos(KKy);
+            double sp = std::sin(KKy);
            float* mx = (float*)(mapx.data + phi*mapx.step);
            float* my = (float*)(mapy.data + phi*mapy.step);

            for (rho = 0; rho < dsize.width; rho++)
            {
-                double r = exp_tab[rho];
-                double x = r*cp + center.x;
-                double y = r*sp + center.y;
+                double x = bufRhos[rho] * cp + center.x;
+                double y = bufRhos[rho] * sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
+        remap(_src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    }
    else
    {
        const int ANGLE_BORDER = 1;
-        cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
-        srcstub = src_with_border; src = srcstub;
-        Size ssize = src.size();
+        cv::copyMakeBorder(_src, _dst, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
+        Mat src = _dst.getMat();
+        Size ssize = _dst.size();
        ssize.height -= 2 * ANGLE_BORDER;
+        const double Kangle = CV_2PI / ssize.height;
+        double Kmag;
+        if (semiLog)
+            Kmag = std::log(maxRadius) / ssize.width;
+        else
+            Kmag = maxRadius / ssize.width;

        int x, y;
        Mat bufx, bufy, bufp, bufa;
-        double ascale = ssize.height / (2 * CV_PI);

        bufx = Mat(1, dsize.width, CV_32F);
        bufy = Mat(1, dsize.width, CV_32F);
@ -3454,225 +3355,65 @@ void cv::logPolar( InputArray _src, OutputArray _dst,
            for (x = 0; x < dsize.width; x++)
                bufy.at<float>(0, x) = (float)y - center.y;

-#if 1
-            cartToPolar(bufx, bufy, bufp, bufa);
-
-            for (x = 0; x < dsize.width; x++)
-                bufp.at<float>(0, x) += 1.f;
-
-            log(bufp, bufp);
-
-            for (x = 0; x < dsize.width; x++)
-            {
-                double rho = bufp.at<float>(0, x) * M;
-                double phi = bufa.at<float>(0, x) * ascale;
-
-                mx[x] = (float)rho;
-                my[x] = (float)phi + ANGLE_BORDER;
-            }
-#else
-            for (x = 0; x < dsize.width; x++)
-            {
-                double xx = bufx.at<float>(0, x);
-                double yy = bufy.at<float>(0, x);
-                double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
-                double a = atan2(yy, xx);
-                if (a < 0)
-                    a = 2 * CV_PI + a;
-                a *= ascale;
-                mx[x] = (float)p;
-                my[x] = (float)a;
-            }
-#endif
-        }
-    }
-
-    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
-        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
-}
-
-/****************************************************************************************
-                                   Linear-Polar Transform
-  J.L. Blanco, Apr 2009
- ****************************************************************************************/
-CV_IMPL
-void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
-            CvPoint2D32f center, double maxRadius, int flags )
-{
-    Mat src_with_border; // don't scope this variable (it holds image data)
-
-    cv::Ptr<CvMat> mapx, mapy;
-
-    CvMat srcstub, *src = (CvMat*)srcarr;
-    CvMat dststub, *dst = (CvMat*)dstarr;
-    CvSize dsize;
-
-    src = cvGetMat( srcarr, &srcstub,0,0 );
-    dst = cvGetMat( dstarr, &dststub,0,0 );
-
-    if( !CV_ARE_TYPES_EQ( src, dst ))
-        CV_Error( CV_StsUnmatchedFormats, "" );
-
-    dsize = cvGetMatSize(dst);
-
-    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
-    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
-
-    if( !(flags & CV_WARP_INVERSE_MAP) )
-    {
-        int phi, rho;
-
-        for( phi = 0; phi < dsize.height; phi++ )
-        {
-            double cp = cos(phi*2*CV_PI/dsize.height);
-            double sp = sin(phi*2*CV_PI/dsize.height);
-            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
-            float* my = (float*)(mapy->data.ptr + phi*mapy->step);
+            cartToPolar(bufx, bufy, bufp, bufa, 0);

-            for( rho = 0; rho < dsize.width; rho++ )
+            if (semiLog)
            {
-                double r = maxRadius*rho/dsize.width;
-                double x = r*cp + center.x;
-                double y = r*sp + center.y;
-
-                mx[rho] = (float)x;
-                my[rho] = (float)y;
+                bufp += 1.f;
+                log(bufp, bufp);
            }
-        }
-    }
-    else
-    {
-        const int ANGLE_BORDER = 1;
-        Mat src_ = cv::cvarrToMat(src);
-        cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
-        srcstub = src_with_border; src = &srcstub;
-        CvSize ssize = cvGetMatSize(src);
-        ssize.height -= 2*ANGLE_BORDER;
-
-        int x, y;
-        CvMat bufx, bufy, bufp, bufa;
-        const double ascale = ssize.height/(2*CV_PI);
-        const double pscale = ssize.width/maxRadius;
-
-        cv::AutoBuffer<float> _buf(4*dsize.width);
-        float* buf = _buf;
-
-        bufx = cvMat( 1, dsize.width, CV_32F, buf );
-        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
-        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
-        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
-
-        for( x = 0; x < dsize.width; x++ )
-            bufx.data.fl[x] = (float)x - center.x;
-
-        for( y = 0; y < dsize.height; y++ )
-        {
-            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
-            float* my = (float*)(mapy->data.ptr + y*mapy->step);
-
-            for( x = 0; x < dsize.width; x++ )
-                bufy.data.fl[x] = (float)y - center.y;

-            cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
-
-            for( x = 0; x < dsize.width; x++ )
+            for (x = 0; x < dsize.width; x++)
            {
-                double rho = bufp.data.fl[x]*pscale;
-                double phi = bufa.data.fl[x]*ascale;
+                double rho = bufp.at<float>(0, x) / Kmag;
+                double phi = bufa.at<float>(0, x) / Kangle;
                mx[x] = (float)rho;
                my[x] = (float)phi + ANGLE_BORDER;
            }
        }
+        remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
+              (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    }
-
-    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
 }

 void cv::linearPolar( InputArray _src, OutputArray _dst,
                      Point2f center, double maxRadius, int flags )
 {
-    CV_INSTRUMENT_REGION()
-
-    CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
-        ocl_linearPolar(_src, _dst, center, maxRadius, flags));
-    Mat src_with_border; // don't scope this variable (it holds image data)
-
-    Mat mapx, mapy;
-    Mat srcstub, src = _src.getMat();
-    _dst.create(src.size(), src.type());
-    Size dsize = src.size();
-
-
-    mapx.create(dsize, CV_32F);
-    mapy.create(dsize, CV_32F);
-
-    if (!(flags & CV_WARP_INVERSE_MAP))
-    {
-        int phi, rho;
-
-        for (phi = 0; phi < dsize.height; phi++)
-        {
-            double cp = std::cos(phi * 2 * CV_PI / dsize.height);
-            double sp = std::sin(phi * 2 * CV_PI / dsize.height);
-            float* mx = (float*)(mapx.data + phi*mapx.step);
-            float* my = (float*)(mapy.data + phi*mapy.step);
-
-            for (rho = 0; rho < dsize.width; rho++)
-            {
-                double r = maxRadius*rho / dsize.width;
-                double x = r*cp + center.x;
-                double y = r*sp + center.y;
-
-                mx[rho] = (float)x;
-                my[rho] = (float)y;
-            }
-        }
-    }
-    else
-    {
-        const int ANGLE_BORDER = 1;
-
-        cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
-        src = src_with_border;
-        Size ssize = src_with_border.size();
-        ssize.height -= 2 * ANGLE_BORDER;
-
-        int x, y;
-        Mat bufx, bufy, bufp, bufa;
-        const double ascale = ssize.height / (2 * CV_PI);
-        const double pscale = ssize.width / maxRadius;
-
-
+    warpPolar(_src, _dst, _src.size(), center, maxRadius, flags & ~WARP_POLAR_LOG);
+}

-        bufx = Mat(1, dsize.width, CV_32F);
-        bufy = Mat(1, dsize.width, CV_32F);
-        bufp = Mat(1, dsize.width, CV_32F);
-        bufa = Mat(1, dsize.width, CV_32F);
+void cv::logPolar( InputArray _src, OutputArray _dst,
+                   Point2f center, double maxRadius, int flags )
+{
+    Size ssize = _src.size();
+    double M = maxRadius > 0 ? std::exp(ssize.width / maxRadius) : 1;
+    warpPolar(_src, _dst, ssize, center, M, flags | WARP_POLAR_LOG);
+}

-        for (x = 0; x < dsize.width; x++)
-            bufx.at<float>(0, x) = (float)x - center.x;
+CV_IMPL
+void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
+                    CvPoint2D32f center, double maxRadius, int flags )
+{
+    Mat src = cvarrToMat(srcarr);
+    Mat dst = cvarrToMat(dstarr);

-        for (y = 0; y < dsize.height; y++)
-        {
-            float* mx = (float*)(mapx.data + y*mapx.step);
-            float* my = (float*)(mapy.data + y*mapy.step);
+    CV_Assert(src.size == dst.size);
+    CV_Assert(src.type() == dst.type());

-            for (x = 0; x < dsize.width; x++)
-                bufy.at<float>(0, x) = (float)y - center.y;
+    cv::linearPolar(src, dst, center, maxRadius, flags);
+}

-            cartToPolar(bufx, bufy, bufp, bufa, 0);
+CV_IMPL
+void cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
+                 CvPoint2D32f center, double M, int flags )
+{
+    Mat src = cvarrToMat(srcarr);
+    Mat dst = cvarrToMat(dstarr);

-            for (x = 0; x < dsize.width; x++)
-            {
-                double rho = bufp.at<float>(0, x) * pscale;
-                double phi = bufa.at<float>(0, x) * ascale;
-                mx[x] = (float)rho;
-                my[x] = (float)phi + ANGLE_BORDER;
-            }
-        }
-    }
+    CV_Assert(src.size == dst.size);
+    CV_Assert(src.type() == dst.type());

-    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
+    cv::logPolar(src, dst, center, M, flags);
 }

 /* End of file. */
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@ -1148,6 +1148,11 @@ static bool ippMorph(int op, int src_type, int dst_type,
    // Different mask flipping
    if(op == MORPH_GRADIENT)
        return false;
+
+    // Integer overflow bug: bail out when src_step or src_step*height reaches IPP_MAX_32S
+    if(src_step >= IPP_MAX_32S ||
+       src_step*height >= IPP_MAX_32S)
+        return false;
 #endif

 #if IPP_VERSION_X100 < 201801
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@ -4003,7 +4003,8 @@ static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 #endif

 #ifdef HAVE_IPP
-#if IPP_VERSION_X100 == 201702  // IW 2017u2 has bug which doesn't allow use of partial inMem with tiling
+// IW 2017u2 has a bug which doesn't allow the use of partial inMem with tiling
+#if IPP_DISABLE_GAUSSIANBLUR_PARALLEL
 #define IPP_GAUSSIANBLUR_PARALLEL 0
 #else
 #define IPP_GAUSSIANBLUR_PARALLEL 1
--- a/modules/imgproc/test/ocl/test_color.cpp
+++ b/modules/imgproc/test/ocl/test_color.cpp
@ -103,6 +103,7 @@ PARAM_TEST_CASE(CvtColor, MatDepth, bool)
            {
            case COLOR_RGB2HLS: case COLOR_BGR2HLS:
                h_limit = 180;
+                /* fallthrough */
            case COLOR_RGB2HLS_FULL: case COLOR_BGR2HLS_FULL:
            {
                ASSERT_EQ(dst_roi.type(), udst_roi.type());
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@ -304,14 +304,9 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool, int)
        UMAT_UPLOAD_INPUT_PARAMETER(src);
        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
-
-    void Near(double threshold = 0.0)
-    {
-        OCL_EXPECT_MATS_NEAR(dst, threshold);
-    }
 };

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm__)
 const int integerEps = 3;
 #else
 const int integerEps = 1;
@ -328,7 +323,7 @@ OCL_TEST_P(Resize, Mat)
        OCL_OFF(cv::resize(src_roi, dst_roi, Size(), fx, fy, interpolation));
        OCL_ON(cv::resize(usrc_roi, udst_roi, Size(), fx, fy, interpolation));

-        Near(eps);
+        OCL_EXPECT_MAT_N_DIFF(dst, eps);
    }
 }

@ -388,11 +383,6 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy
        if (noType != map2Type)
            UMAT_UPLOAD_INPUT_PARAMETER(map2);
    }
-
-    void Near(double threshold = 0.0)
-    {
-        OCL_EXPECT_MATS_NEAR(dst, threshold);
-    }
 };

 typedef Remap Remap_INTER_NEAREST;
@ -406,7 +396,7 @@ OCL_TEST_P(Remap_INTER_NEAREST, Mat)
        OCL_OFF(cv::remap(src_roi, dst_roi, map1_roi, map2_roi, INTER_NEAREST, borderType, val));
        OCL_ON(cv::remap(usrc_roi, udst_roi, umap1_roi, umap2_roi, INTER_NEAREST, borderType, val));

-        Near(1.0);
+        OCL_EXPECT_MAT_N_DIFF(dst, 1.0);
    }
 }

@ -423,12 +413,14 @@ OCL_TEST_P(Remap_INTER_LINEAR, Mat)
        // TODO investigate accuracy
        if (cv::ocl::Device::getDefault().isNVidia())
            eps = 8.0;
+#elif defined(__arm__)
+        eps = 8.0;
 #endif

        OCL_OFF(cv::remap(src_roi, dst_roi, map1_roi, map2_roi, INTER_LINEAR, borderType, val));
        OCL_ON(cv::remap(usrc_roi, udst_roi, umap1_roi, umap2_roi, INTER_LINEAR, borderType, val));

-        Near(eps);
+        OCL_EXPECT_MAT_N_DIFF(dst, eps);
    }
 }

--- a/modules/imgproc/test/test_grabcut.cpp
+++ b/modules/imgproc/test/test_grabcut.cpp
@ -92,7 +92,9 @@ void CV_GrabcutTest::run( int /* start_from */)
    mask = Scalar(0);
    Mat bgdModel, fgdModel;
    grabCut( img, mask, rect, bgdModel, fgdModel, 0, GC_INIT_WITH_RECT );
-    grabCut( img, mask, rect, bgdModel, fgdModel, 2, GC_EVAL );
+    bgdModel.copyTo(exp_bgdModel);
+    fgdModel.copyTo(exp_fgdModel);
+    grabCut( img, mask, rect, bgdModel, fgdModel, 2, GC_EVAL_FREEZE_MODEL );

    // Multiply images by 255 for more visuality of test data.
    if( mask_prob.empty() )
@ -105,12 +107,20 @@ void CV_GrabcutTest::run( int /* start_from */)
        exp_mask1 = (mask & 1) * 255;
        imwrite(string(ts->get_data_path()) + "grabcut/exp_mask1.png", exp_mask1);
    }
-
    if (!verify((mask & 1) * 255, exp_mask1))
    {
        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
        return;
    }
+    // The model should not be changed after calling with GC_EVAL_FREEZE_MODEL
+    double sumBgdModel = cv::sum(cv::abs(bgdModel) - cv::abs(exp_bgdModel))[0];
+    double sumFgdModel = cv::sum(cv::abs(fgdModel) - cv::abs(exp_fgdModel))[0];
+    if (sumBgdModel >= 0.1 || sumFgdModel >= 0.1)
+    {
+        ts->printf(cvtest::TS::LOG, "sumBgdModel = %f, sumFgdModel = %f\n", sumBgdModel, sumFgdModel);
+        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
+        return;
+    }

    mask = mask_prob;
    bgdModel.release();
@ -124,7 +134,6 @@ void CV_GrabcutTest::run( int /* start_from */)
        exp_mask2 = (mask & 1) * 255;
        imwrite(string(ts->get_data_path()) + "grabcut/exp_mask2.png", exp_mask2);
    }
-
    if (!verify((mask & 1) * 255, exp_mask2))
    {
        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@ -1781,7 +1781,7 @@ TEST(Imgproc_Remap, DISABLED_memleak)
    }
 }

-
+/** @deprecated */
 TEST(Imgproc_linearPolar, identity)
 {
    const int N = 33;
@ -1821,7 +1821,7 @@ TEST(Imgproc_linearPolar, identity)
 #endif
 }

-
+/** @deprecated */
 TEST(Imgproc_logPolar, identity)
 {
    const int N = 33;
@ -1862,6 +1862,52 @@ TEST(Imgproc_logPolar, identity)
 #endif
 }

+TEST(Imgproc_warpPolar, identity)
+{
+    const int N = 33;
+    Mat in(N, N, CV_8UC3, Scalar(255, 0, 0));
+    in(cv::Rect(N / 3, N / 3, N / 3, N / 3)).setTo(Scalar::all(255));
+    cv::blur(in, in, Size(5, 5));
+    cv::blur(in, in, Size(5, 5));
+
+    Mat src = in.clone();
+    Mat dst;
+
+    Rect roi = Rect(0, 0, in.cols - ((N + 19) / 20), in.rows);
+    Point2f center = Point2f((N - 1) * 0.5f, (N - 1) * 0.5f);
+    double radius = N * 0.5;
+    int flags = CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR;
+    // test linearPolar
+    for (int ki = 1; ki <= 5; ki++)
+    {
+        warpPolar(src, dst, src.size(), center, radius, flags + WARP_POLAR_LINEAR + CV_WARP_INVERSE_MAP);
+        warpPolar(dst, src, src.size(), center, radius, flags + WARP_POLAR_LINEAR);
+
+        double psnr = cv::PSNR(in(roi), src(roi));
+        EXPECT_LE(25, psnr) << "iteration=" << ki;
+    }
+    // test logPolar
+    src = in.clone();
+    for (int ki = 1; ki <= 5; ki++)
+    {
+        warpPolar(src, dst, src.size(),center, radius, flags + WARP_POLAR_LOG + CV_WARP_INVERSE_MAP );
+        warpPolar(dst, src, src.size(),center, radius, flags + WARP_POLAR_LOG);
+
+        double psnr = cv::PSNR(in(roi), src(roi));
+        EXPECT_LE(25, psnr) << "iteration=" << ki;
+    }
+
+#if 0
+    Mat all(N*2+2,N*2+2, src.type(), Scalar(0,0,255));
+    in.copyTo(all(Rect(0,0,N,N)));
+    src.copyTo(all(Rect(0,N+1,N,N)));
+    src.copyTo(all(Rect(N+1,0,N,N)));
+    dst.copyTo(all(Rect(N+1,N+1,N,N)));
+    imwrite("linearPolar.png", all);
+    imshow("input", in); imshow("result", dst); imshow("restore", src); imshow("all", all);
+    cv::waitKey();
+#endif
+}

 }} // namespace
 /* End of file. */
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@ -27,6 +27,7 @@

 #include "pycompat.hpp"

+#include <map>

 static PyObject* opencv_error = 0;

@ -1621,13 +1622,19 @@ static PyObject *pycvSetMouseCallback(PyObject*, PyObject *args, PyObject *kw)
    if (param == NULL) {
        param = Py_None;
    }
-    static PyObject* last_param = NULL;
-    if (last_param) {
-        Py_DECREF(last_param);
-        last_param = NULL;
+    PyObject* py_callback_info = Py_BuildValue("OO", on_mouse, param);
+    static std::map<std::string, PyObject*> registered_callbacks;
+    std::map<std::string, PyObject*>::iterator i = registered_callbacks.find(name);
+    if (i != registered_callbacks.end())
+    {
+        Py_DECREF(i->second);
+        i->second = py_callback_info;
+    }
+    else
+    {
+        registered_callbacks.insert(std::pair<std::string, PyObject*>(std::string(name), py_callback_info));
    }
-    last_param = Py_BuildValue("OO", on_mouse, param);
-    ERRWRAP2(setMouseCallback(name, OnMouse, last_param));
+    ERRWRAP2(setMouseCallback(name, OnMouse, py_callback_info));
    Py_RETURN_NONE;
 }
 #endif
@ -1663,13 +1670,20 @@ static PyObject *pycvCreateTrackbar(PyObject*, PyObject *args)
        PyErr_SetString(PyExc_TypeError, "on_change must be callable");
        return NULL;
    }
-    static PyObject* last_param = NULL;
-    if (last_param) {
-        Py_DECREF(last_param);
-        last_param = NULL;
+    PyObject* py_callback_info = Py_BuildValue("OO", on_change, Py_None);  // (on_change, None) pair passed to createTrackbar below; NOTE(review): result is not checked for NULL
+    std::string name = std::string(window_name) + ":" + std::string(trackbar_name);  // registry key: one entry per window/trackbar combination
+    static std::map<std::string, PyObject*> registered_callbacks;  // key -> pair currently registered for that trackbar
+    std::map<std::string, PyObject*>::iterator i = registered_callbacks.find(name);
+    if (i != registered_callbacks.end())
+    {
+        Py_DECREF(i->second);  // same trackbar registered again: drop the reference to its previous pair
+        i->second = py_callback_info;
    }
-    last_param = Py_BuildValue("OO", on_change, Py_None);
-    ERRWRAP2(createTrackbar(trackbar_name, window_name, value, count, OnChange, last_param));
+    else
+    {
+        registered_callbacks.insert(std::pair<std::string, PyObject*>(name, py_callback_info));
+    }
+    ERRWRAP2(createTrackbar(trackbar_name, window_name, value, count, OnChange, py_callback_info));
    Py_RETURN_NONE;
 }

@ -1717,13 +1731,21 @@ static PyObject *pycvCreateButton(PyObject*, PyObject *args, PyObject *kw)
        userdata = Py_None;
    }

-    static PyObject* last_param = NULL;
-    if (last_param) {
-        Py_DECREF(last_param);
-        last_param = NULL;
+    PyObject* py_callback_info = Py_BuildValue("OO", on_change, userdata);  // (on_change, userdata) pair passed to createButton below; NOTE(review): result is not checked for NULL
+    std::string name(button_name);  // registry key: the button name alone
+
+    static std::map<std::string, PyObject*> registered_callbacks;  // button name -> pair currently registered for that button
+    std::map<std::string, PyObject*>::iterator i = registered_callbacks.find(name);
+    if (i != registered_callbacks.end())
+    {
+        Py_DECREF(i->second);  // same button registered again: drop the reference to its previous pair
+        i->second = py_callback_info;
+    }
+    else
+    {
+        registered_callbacks.insert(std::pair<std::string, PyObject*>(name, py_callback_info));  // first callback seen for this button
    }
-    last_param = Py_BuildValue("OO", on_change, userdata);
-    ERRWRAP2(createButton(button_name, OnButtonChange, last_param, button_type, initial_button_state != 0));
+    ERRWRAP2(createButton(button_name, OnButtonChange, py_callback_info, button_type, initial_button_state != 0));
    Py_RETURN_NONE;
 }
 #endif
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@ -74,7 +74,6 @@ Ptr<Blender> Blender::createDefault(int type, bool try_gpu)
    if (type == MULTI_BAND)
        return makePtr<MultiBandBlender>(try_gpu);
    CV_Error(Error::StsBadArg, "unsupported blending method");
-    return Ptr<Blender>();
 }


--- a/modules/stitching/src/timelapsers.cpp
+++ b/modules/stitching/src/timelapsers.cpp
@ -53,7 +53,6 @@ Ptr<Timelapser> Timelapser::createDefault(int type)
    if (type == CROP)
        return makePtr<TimelapserCrop>();
    CV_Error(Error::StsBadArg, "unsupported timelapsing method");
-    return Ptr<Timelapser>();
 }


--- a/modules/stitching/src/warpers_cuda.cpp
+++ b/modules/stitching/src/warpers_cuda.cpp
@ -163,7 +163,6 @@ Rect cv::detail::PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArr
    (void)xmap;
    (void)ymap;
    throw_no_cuda();
-    return Rect();
 #else
    projector_.setCameraParams(K, R, T);

@ -198,7 +197,6 @@ Point cv::detail::PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, I
    (void)border_mode;
    (void)dst;
    throw_no_cuda();
-    return Point();
 #else
    Rect dst_roi = buildMaps(src.size(), K, R, T, d_xmap_, d_ymap_);
    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@ -216,7 +214,6 @@ Rect cv::detail::SphericalWarperGpu::buildMaps(Size src_size, InputArray K, Inpu
    (void)xmap;
    (void)ymap;
    throw_no_cuda();
-    return Rect();
 #else
    projector_.setCameraParams(K, R);

@ -242,7 +239,6 @@ Point cv::detail::SphericalWarperGpu::warp(const cuda::GpuMat & src, InputArray
    (void)border_mode;
    (void)dst;
    throw_no_cuda();
-    return Point();
 #else
    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@ -262,7 +258,6 @@ Rect cv::detail::CylindricalWarperGpu::buildMaps(Size src_size, InputArray K, In
    (void)xmap;
    (void)ymap;
    throw_no_cuda();
-    return Rect();
 #else
    projector_.setCameraParams(K, R);

@ -288,7 +283,6 @@ Point cv::detail::CylindricalWarperGpu::warp(const cuda::GpuMat & src, InputArra
    (void)border_mode;
    (void)dst;
    throw_no_cuda();
-    return Point();
 #else
    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
--- a/modules/superres/src/btv_l1_cuda.cpp
+++ b/modules/superres/src/btv_l1_cuda.cpp
@ -55,7 +55,6 @@ using namespace cv::superres::detail;
 Ptr<SuperResolution> cv::superres::createSuperResolution_BTVL1_CUDA()
 {
    CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<SuperResolution>();
 }

 #else // HAVE_CUDA
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@ -196,9 +196,8 @@ Ptr<FrameSource> cv::superres::createFrameSource_Camera(int deviceId)

 Ptr<FrameSource> cv::superres::createFrameSource_Video_CUDA(const String& fileName)
 {
-    (void) fileName;
+    CV_UNUSED(fileName);
    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<FrameSource>();
 }

 #else // HAVE_OPENCV_CUDACODEC
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@ -411,25 +411,21 @@ Ptr<cv::superres::DualTVL1OpticalFlow> cv::superres::createOptFlow_DualTVL1()
 Ptr<cv::superres::FarnebackOpticalFlow> cv::superres::createOptFlow_Farneback_CUDA()
 {
    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<cv::superres::FarnebackOpticalFlow>();
 }

 Ptr<cv::superres::DualTVL1OpticalFlow> cv::superres::createOptFlow_DualTVL1_CUDA()
 {
    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<cv::superres::DualTVL1OpticalFlow>();
 }

 Ptr<cv::superres::BroxOpticalFlow> cv::superres::createOptFlow_Brox_CUDA()
 {
    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<cv::superres::BroxOpticalFlow>();
 }

 Ptr<cv::superres::PyrLKOpticalFlow> cv::superres::createOptFlow_PyrLK_CUDA()
 {
    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<cv::superres::PyrLKOpticalFlow>();
 }

 #else // HAVE_OPENCV_CUDAOPTFLOW
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@ -122,6 +122,25 @@ do \
    << "Size: " << mat1.size() << std::endl; \
 } while ((void)0, 0)

+#define OCL_EXPECT_MAT_N_DIFF(name, eps) /* tolerant compare of name##_roi against u##name##_roi: a few elements may differ by more than eps */ \
+do \
+{ \
+    ASSERT_EQ(name ## _roi.type(), u ## name ## _roi.type()); \
+    ASSERT_EQ(name ## _roi.size(), u ## name ## _roi.size()); \
+    Mat diff, binary, binary_8; \
+    absdiff(name ## _roi, u ## name ## _roi, diff); \
+    Mat mask(diff.size(), CV_8UC(diff.channels()), cv::Scalar::all(255)); /* channel count taken from the compared data, so the macro does not need a variable called 'dst' */ \
+    if (mask.cols > 2 && mask.rows > 2) \
+        mask(cv::Rect(1, 1, mask.cols - 2, mask.rows - 2)).setTo(0); /* clear the interior: only the 1-pixel border stays set */ \
+    cv::threshold(diff, binary, (double)(eps), 255, cv::THRESH_BINARY); /* non-zero where the absolute difference exceeds eps */ \
+    EXPECT_LE(countNonZero(binary.reshape(1)), (int)(binary.cols*binary.rows*5/1000)) /* overall budget: cols*rows*5/1000 mismatching elements */ \
+        << "Size: " << name ## _roi.size() << std::endl; \
+    binary.convertTo(binary_8, mask.type()); \
+    binary_8 = binary_8 & mask; /* keep only the mismatches that lie on the border */ \
+    EXPECT_LE(countNonZero(binary_8.reshape(1)), (int)((binary_8.cols+binary_8.rows)/100)) /* separate budget for border mismatches: (cols+rows)/100 */ \
+        << "Size: " << name ## _roi.size() << std::endl; \
+} while ((void)0, 0)
+
 #define OCL_EXPECT_MATS_NEAR(name, eps) \
 do \
 { \
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@ -116,7 +116,8 @@ enum VideoCaptureAPIs {
       CAP_IMAGES       = 2000,         //!< OpenCV Image Sequence (e.g. img_%02d.jpg)
       CAP_ARAVIS       = 2100,         //!< Aravis SDK
       CAP_OPENCV_MJPEG = 2200,         //!< Built-in OpenCV MotionJPEG codec
-       CAP_INTEL_MFX    = 2300          //!< Intel MediaSDK
+       CAP_INTEL_MFX    = 2300,         //!< Intel MediaSDK
+       CAP_XINE         = 2400,         //!< XINE engine (Linux)
     };

 /** @brief %VideoCapture generic properties identifier.
--- a/modules/videoio/src/cap.cpp
+++ b/modules/videoio/src/cap.cpp
@ -60,6 +60,13 @@
 #pragma warning(disable: 4748)
 #endif

+#if defined(__clang__)  // the backend switch in this file falls through between cases on purpose
+#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
+#endif
+#if defined(__GNUC__) && __GNUC__ >= 7  // the option only exists since GCC 7; older GCC warns about an unknown pragma option
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#endif
+
 using namespace cv;

 namespace cv
@ -311,15 +318,11 @@ CV_IMPL CvCapture * cvCreateFileCaptureWithPreference (const char * filename, in
        if (apiPreference) break;
 #endif

-    case CAP_MSMF:
 #ifdef HAVE_MSMF
+    case CAP_MSMF:
        TRY_OPEN(result, cvCreateFileCapture_MSMF (filename))
-#endif
-
-#ifdef HAVE_XINE
-        TRY_OPEN(result, cvCreateFileCapture_XINE (filename))
-#endif
        if (apiPreference) break;
+#endif

 #ifdef HAVE_VFW
    case CAP_VFW:
@ -533,6 +536,14 @@ static Ptr<IVideoCapture> IVideoCapture_create(const String& filename, int apiPr
 {
    bool useAny = (apiPreference == CAP_ANY);
    Ptr<IVideoCapture> capture;
+#ifdef HAVE_XINE  // XINE backend is compiled in only when HAVE_XINE is defined
+    if (useAny || apiPreference == CAP_XINE)  // tried for CAP_ANY or when XINE is requested explicitly
+    {
+        capture = createXINECapture(filename.c_str());
+        if (capture && capture->isOpened())  // success: return immediately
+            return capture;
+    }  // on failure control continues to the backend checks below
+#endif
 #ifdef HAVE_GPHOTO2
    if (useAny || apiPreference == CAP_GPHOTO2)
    {
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@ -1614,7 +1614,8 @@ bool CvVideoWriter_GStreamer::open( const char * filename, int fourcc,
        caps = gst_caps_fixate(caps);
 #endif
 #else
-        CV_Assert(!"Gstreamer 0.10.29 or newer is required for grayscale input");
+        CV_Error(Error::StsError,
+                 "Gstreamer 0.10.29 or newer is required for grayscale input");
 #endif
    }

--- a/modules/videoio/src/cap_xine.cpp
+++ b/modules/videoio/src/cap_xine.cpp
--- a/modules/videoio/src/precomp.hpp
+++ b/modules/videoio/src/precomp.hpp
@ -133,8 +133,6 @@ CvCapture* cvCreateCameraCapture_Aravis( int index );
 CvCapture* cvCreateFileCapture_Images(const char* filename);
 CvVideoWriter* cvCreateVideoWriter_Images(const char* filename);

-CvCapture* cvCreateFileCapture_XINE (const char* filename);
-

 #define CV_CAP_GSTREAMER_1394		0
 #define CV_CAP_GSTREAMER_V4L		1
@ -195,6 +193,8 @@ namespace cv

    Ptr<IVideoCapture> createGPhoto2Capture(int index);
    Ptr<IVideoCapture> createGPhoto2Capture(const String& deviceName);
+
+    Ptr<IVideoCapture> createXINECapture(const char* filename);
 }

 #endif /* __VIDEOIO_H_ */
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@ -131,7 +131,7 @@ public:
            return;
        }

-        if (ext != "wmv")
+        if (ext != "wmv" && ext != "h264" && ext != "h265")
        {
            SCOPED_TRACE("progressive seek");
            ASSERT_TRUE(cap.set(CAP_PROP_POS_FRAMES, 0));
@ -141,7 +141,7 @@ public:
            }
        }

-        if (ext != "mpg" && ext != "wmv")
+        if (ext != "mpg" && ext != "wmv" && ext != "h264" && ext != "h265")
        {
            SCOPED_TRACE("random seek");
            ASSERT_TRUE(cap.set(CAP_PROP_POS_FRAMES, 0));
@ -334,6 +334,11 @@ int backend_params[] = {
 #ifdef HAVE_FFMPEG
    CAP_FFMPEG,
 #endif
+
+#ifdef HAVE_XINE
+    CAP_XINE,
+#endif
+
    CAP_OPENCV_MJPEG
    // CAP_INTEL_MFX
 };
@ -345,6 +350,8 @@ string bunny_params[] = {
    string("mp4"),
    string("mpg"),
    string("avi"),
+    string("h264"),
+    string("h265"),
 #endif
    string("mjpg.avi")
 };
--- a/modules/videostab/src/global_motion.cpp
+++ b/modules/videostab/src/global_motion.cpp
@ -546,9 +546,8 @@ Mat MotionEstimatorL1::estimate(InputArray points0, InputArray points1, bool *ok

 #ifndef HAVE_CLP

+    CV_UNUSED(ok);
    CV_Error(Error::StsError, "The library is built without Clp support");
-    if (ok) *ok = false;
-    return Mat::eye(3, 3, CV_32F);

 #else

--- a/modules/videostab/src/inpainting.cpp
+++ b/modules/videostab/src/inpainting.cpp
@ -329,12 +329,12 @@ MotionInpainter::MotionInpainter()
 {
 #ifdef HAVE_OPENCV_CUDAOPTFLOW
    setOptFlowEstimator(makePtr<DensePyrLkOptFlowEstimatorGpu>());
-#else
-    CV_Error(Error::StsNotImplemented, "Current implementation of MotionInpainter requires CUDA");
-#endif
    setFlowErrorThreshold(1e-4f);
    setDistThreshold(5.f);
    setBorderMode(BORDER_REPLICATE);
+#else
+    CV_Error(Error::StsNotImplemented, "Current implementation of MotionInpainter requires CUDA");
+#endif
 }


--- a/modules/viz/src/types.cpp
+++ b/modules/viz/src/types.cpp
@ -63,8 +63,7 @@ cv::viz::Mesh cv::viz::Mesh::load(const String& file, int type)
    switch (type) {
      case LOAD_AUTO:
      {
-        CV_Assert(!"cv::viz::Mesh::LOAD_AUTO: Not implemented yet");
-        break;
+        CV_Error(Error::StsError, "cv::viz::Mesh::LOAD_AUTO: Not implemented yet");
      }
      case LOAD_PLY:
      {
@ -83,8 +82,7 @@ cv::viz::Mesh cv::viz::Mesh::load(const String& file, int type)
        break;
      }
      default:
-        CV_Assert(!"cv::viz::Mesh::load: Unknown file type");
-        break;
+        CV_Error(Error::StsError, "cv::viz::Mesh::load: Unknown file type");
    }

    vtkSmartPointer<vtkPolyData> polydata = reader->GetOutput();
--- a/modules/viz/src/vizcore.cpp
+++ b/modules/viz/src/vizcore.cpp
@ -194,7 +194,7 @@ void cv::viz::writeCloud(const String& file, InputArray cloud, InputArray colors
        vtkOBJWriter::SafeDownCast(writer)->SetFileName(file.c_str());
    }
    else
-        CV_Assert(!"Unsupported format");
+        CV_Error(Error::StsError, "Unsupported format");

    writer->SetInputConnection(source->GetOutputPort());
    writer->Write();
@ -228,7 +228,7 @@ cv::Mat cv::viz::readCloud(const String& file, OutputArray colors, OutputArray n
        vtkSTLReader::SafeDownCast(reader)->SetFileName(file.c_str());
    }
    else
-        CV_Assert(!"Unsupported format");
+        CV_Error(Error::StsError, "Unsupported format");

    cv::Mat cloud;

@ -325,7 +325,7 @@ void cv::viz::writeTrajectory(InputArray _traj, const String& files_format, int
        return;
    }

-    CV_Assert(!"Unsupported array kind");
+    CV_Error(Error::StsError, "Unsupported array kind");
 }

 ///////////////////////////////////////////////////////////////////////////////////////////////
--- a/modules/viz/src/vtk/vtkCloudMatSource.cpp
+++ b/modules/viz/src/vtk/vtkCloudMatSource.cpp
@ -128,7 +128,7 @@ int cv::viz::vtkCloudMatSource::SetColorCloudNormals(InputArray _cloud, InputArr
    else if (n.depth() == CV_64F && c.depth() == CV_64F)
        filterNanNormalsCopy<double, double>(n, c, total);
    else
-        CV_Assert(!"Unsupported normals/cloud type");
+        CV_Error(Error::StsError, "Unsupported normals/cloud type");

    return total;
 }
@ -155,7 +155,7 @@ int cv::viz::vtkCloudMatSource::SetColorCloudNormalsTCoords(InputArray _cloud, I
    else if (tc.depth() == CV_64F && cl.depth() == CV_64F)
        filterNanTCoordsCopy<double, double>(tc, cl, total);
    else
-        CV_Assert(!"Unsupported tcoords/cloud type");
+        CV_Error(Error::StsError, "Unsupported tcoords/cloud type");

    return total;
 }
--- a/samples/cpp/polar_transforms.cpp
+++ b/samples/cpp/polar_transforms.cpp
@ -43,33 +43,82 @@ int main( int argc, char** argv )
    moveWindow( "Log-Polar", 700,20 );
    moveWindow( "Recovered Linear-Polar", 20, 350 );
    moveWindow( "Recovered Log-Polar", 700, 350 );
-
+    int flags = INTER_LINEAR + WARP_FILL_OUTLIERS;
+    Mat src;
    for(;;)
    {
-        Mat frame;
-        capture >> frame;
+        capture >> src;

-        if( frame.empty() )
+        if(src.empty() )
            break;

-        Point2f center( (float)frame.cols / 2, (float)frame.rows / 2 );
-        double M = 70;
-
-        logPolar(frame,log_polar_img, center, M, INTER_LINEAR + WARP_FILL_OUTLIERS);
-        linearPolar(frame,lin_polar_img, center, M, INTER_LINEAR + WARP_FILL_OUTLIERS);
-
-        logPolar(log_polar_img, recovered_log_polar, center, M, WARP_INVERSE_MAP + INTER_LINEAR);
-        linearPolar(lin_polar_img, recovered_lin_polar_img, center, M, WARP_INVERSE_MAP + INTER_LINEAR + WARP_FILL_OUTLIERS);
-
-        imshow("Log-Polar", log_polar_img );
-        imshow("Linear-Polar", lin_polar_img );
+        Point2f center( (float)src.cols / 2, (float)src.rows / 2 );
+        double maxRadius = 0.7*min(center.y, center.x);
+
+#if 0 //deprecated logPolar/linearPolar calls, kept for reference next to the warpPolar calls below
+        double M = src.cols / log(maxRadius);
+        logPolar(src, log_polar_img, center, M, flags);
+        linearPolar(src, lin_polar_img, center, maxRadius, flags);
+
+        logPolar(log_polar_img, recovered_log_polar, center, M, flags + WARP_INVERSE_MAP);
+        linearPolar(lin_polar_img, recovered_lin_polar_img, center, maxRadius, flags + WARP_INVERSE_MAP);
+#endif
+        //! [InverseMap]
+        // direct transform
+        warpPolar(src, lin_polar_img, Size(),center, maxRadius, flags);                     // linear Polar
+        warpPolar(src, log_polar_img, Size(),center, maxRadius, flags + WARP_POLAR_LOG);    // semilog Polar
+        // inverse transform
+        warpPolar(lin_polar_img, recovered_lin_polar_img, src.size(), center, maxRadius, flags + WARP_INVERSE_MAP);
+        warpPolar(log_polar_img, recovered_log_polar, src.size(), center, maxRadius, flags + WARP_POLAR_LOG + WARP_INVERSE_MAP);
+        //! [InverseMap]
+
+        // Below is the reverse transformation for (rho, phi)->(x, y) :
+        Mat dst;
+        if (flags & WARP_POLAR_LOG)
+            dst = log_polar_img;
+        else
+            dst = lin_polar_img;
+        //get a point from the polar image
+        int rho = cvRound(dst.cols * 0.75);
+        int phi = cvRound(dst.rows / 2.0);
+
+        //! [InverseCoordinate]
+        double angleRad, magnitude;
+        double Kangle = dst.rows / CV_2PI;
+        angleRad = phi / Kangle;
+        if (flags & WARP_POLAR_LOG)
+        {
+            double Klog = dst.cols / std::log(maxRadius);
+            magnitude = std::exp(rho / Klog);
+        }
+        else
+        {
+            double Klin = dst.cols / maxRadius;
+            magnitude = rho / Klin;
+        }
+        int x = cvRound(center.x + magnitude * cos(angleRad));
+        int y = cvRound(center.y + magnitude * sin(angleRad));
+        //! [InverseCoordinate]
+        drawMarker(src, Point(x, y), Scalar(0, 255, 0));
+        drawMarker(dst, Point(rho, phi), Scalar(0, 255, 0));
+
+
+#if 0  //C version; locals are prefixed with c_ so they do not clash with src/dst declared above
+        CvMat c_src = src;
+        CvMat c_dst = lin_polar_img;
+        CvMat c_inverse = recovered_lin_polar_img;
+        cvLinearPolar(&c_src, &c_dst, center, maxRadius, flags);
+        cvLinearPolar(&c_dst, &c_inverse, center, maxRadius,flags + WARP_INVERSE_MAP);
+#endif
+
+        imshow("Src frame", src);
+        imshow("Log-Polar", log_polar_img);
+        imshow("Linear-Polar", lin_polar_img);
        imshow("Recovered Linear-Polar", recovered_lin_polar_img );
        imshow("Recovered Log-Polar", recovered_log_polar );

        if( waitKey(10) >= 0 )
            break;
    }
-
-    waitKey(0);
    return 0;
 }
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@ -35,12 +35,14 @@ using namespace dnn;
 float confThreshold;
 std::vector<std::string> classes;

-void postprocess(Mat& frame, const Mat& out, Net& net);
+void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net);

 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

 void callback(int pos, void* userdata);

+std::vector<String> getOutputsNames(const Net& net);
+
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, keys);
@ -115,9 +117,10 @@ int main(int argc, char** argv)
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
-        Mat out = net.forward();
+        std::vector<Mat> outs;
+        net.forward(outs, getOutputsNames(net));

-        postprocess(frame, out, net);
+        postprocess(frame, outs, net);

        // Put efficiency information.
        std::vector<double> layersTimes;
@ -131,18 +134,19 @@ int main(int argc, char** argv)
    return 0;
 }

-void postprocess(Mat& frame, const Mat& out, Net& net)
+void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
 {
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

-    float* data = (float*)out.data;
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
-        for (size_t i = 0; i < out.total(); i += 7)
+        CV_Assert(outs.size() == 1);
+        float* data = (float*)outs[0].data;
+        for (size_t i = 0; i < outs[0].total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
@ -161,7 +165,9 @@ void postprocess(Mat& frame, const Mat& out, Net& net)
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
-        for (size_t i = 0; i < out.total(); i += 7)
+        CV_Assert(outs.size() == 1);
+        float* data = (float*)outs[0].data;
+        for (size_t i = 0; i < outs[0].total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
@ -177,27 +183,45 @@ void postprocess(Mat& frame, const Mat& out, Net& net)
    }
    else if (outLayerType == "Region")
    {
-        // Network produces output blob with a shape NxC where N is a number of
-        // detected objects and C is a number of classes + 4 where the first 4
-        // numbers are [center_x, center_y, width, height]
-        for (int i = 0; i < out.rows; ++i, data += out.cols)
+        std::vector<int> classIds;
+        std::vector<float> confidences;
+        std::vector<Rect> boxes;
+        for (size_t i = 0; i < outs.size(); ++i)
        {
-            Mat confidences = out.row(i).colRange(5, out.cols);
-            Point classIdPoint;
-            double confidence;
-            minMaxLoc(confidences, 0, &confidence, 0, &classIdPoint);
-            if (confidence > confThreshold)
+            // Network produces output blob with a shape NxC where N is a number of
+            // detected objects and C is a number of classes + 5: the first 4 numbers are
+            // [center_x, center_y, width, height], index 4 is not read here, class scores start at 5
+            float* data = (float*)outs[i].data;
+            for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
            {
-                int classId = classIdPoint.x;
-                int centerX = (int)(data[0] * frame.cols);
-                int centerY = (int)(data[1] * frame.rows);
-                int width = (int)(data[2] * frame.cols);
-                int height = (int)(data[3] * frame.rows);
-                int left = centerX - width / 2;
-                int top = centerY - height / 2;
-                drawPred(classId, (float)confidence, left, top, left + width, top + height, frame);
+                Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
+                Point classIdPoint;
+                double confidence;
+                minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
+                if (confidence > confThreshold)
+                {
+                    int centerX = (int)(data[0] * frame.cols);
+                    int centerY = (int)(data[1] * frame.rows);
+                    int width = (int)(data[2] * frame.cols);
+                    int height = (int)(data[3] * frame.rows);
+                    int left = centerX - width / 2;
+                    int top = centerY - height / 2;
+
+                    classIds.push_back(classIdPoint.x);
+                    confidences.push_back((float)confidence);
+                    boxes.push_back(Rect(left, top, width, height));
+                }
            }
        }
+        std::vector<int> indices;
+        NMSBoxes(boxes, confidences, confThreshold, 0.4, indices);
+        for (size_t i = 0; i < indices.size(); ++i)
+        {
+            int idx = indices[i];
+            Rect box = boxes[idx];
+            drawPred(classIds[idx], confidences[idx], box.x, box.y,
+                     box.x + box.width, box.y + box.height, frame);
+        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
@ -227,3 +251,17 @@ void callback(int pos, void*)
 {
    confThreshold = pos * 0.01f;
 }
+
+std::vector<String> getOutputsNames(const Net& net)  // returns the names of the layers reported by net.getUnconnectedOutLayers()
+{
+    static std::vector<String> names;  // filled on first use; once non-empty it is returned as-is and `net` is not consulted again
+    if (names.empty())
+    {
+        std::vector<int> outLayers = net.getUnconnectedOutLayers();
+        std::vector<String> layersNames = net.getLayerNames();
+        names.resize(outLayers.size());
+        for (size_t i = 0; i < outLayers.size(); ++i)
+            names[i] = layersNames[outLayers[i] - 1];  // ids are offset by one from getLayerNames() indices -- presumably id 0 is the input layer; TODO confirm
+    }
+    return names;  // returned by value (a copy of the cached vector)
+}
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@ -55,7 +55,11 @@ net.setPreferableTarget(args.target)

 confThreshold = args.thr

-def postprocess(frame, out):
+def getOutputsNames(net):
+    layersNames = net.getLayerNames()
+    return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()]
+
+def postprocess(frame, outs):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

@ -63,7 +67,7 @@ def postprocess(frame, out):
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

-        label = '%.2f' % confidence
+        label = '%.2f' % conf

        # Print a label of class.
        if classes:
@ -83,6 +87,8 @@ def postprocess(frame, out):
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
+        assert(len(outs) == 1)
+        out = outs[0]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
@ -96,6 +102,8 @@ def postprocess(frame, out):
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
+        assert(len(outs) == 1)
+        out = outs[0]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
@ -109,18 +117,33 @@ def postprocess(frame, out):
        # Network produces output blob with a shape NxC where N is a number of
        # detected objects and C is a number of classes + 4 where the first 4
        # numbers are [center_x, center_y, width, height]
-        for detection in out:
-            confidences = detection[5:]
-            classId = np.argmax(confidences)
-            confidence = confidences[classId]
-            if confidence > confThreshold:
-                center_x = int(detection[0] * frameWidth)
-                center_y = int(detection[1] * frameHeight)
-                width = int(detection[2] * frameWidth)
-                height = int(detection[3] * frameHeight)
-                left = center_x - width / 2
-                top = center_y - height / 2
-                drawPred(classId, confidence, left, top, left + width, top + height)
+        classIds = []
+        confidences = []
+        boxes = []
+        for out in outs:
+            for detection in out:
+                scores = detection[5:]
+                classId = np.argmax(scores)
+                confidence = scores[classId]
+                if confidence > confThreshold:
+                    center_x = int(detection[0] * frameWidth)
+                    center_y = int(detection[1] * frameHeight)
+                    width = int(detection[2] * frameWidth)
+                    height = int(detection[3] * frameHeight)
+                    left = center_x - width / 2
+                    top = center_y - height / 2
+                    classIds.append(classId)
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
+        indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, 0.4)
+        for i in indices:
+            i = i[0]
+            box = boxes[i]
+            left = box[0]
+            top = box[1]
+            width = box[2]
+            height = box[3]
+            drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

 # Process inputs
 winName = 'Deep learning object detection in OpenCV'
@ -152,9 +175,9 @@ while cv.waitKey(1) < 0:
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        frame = cv.resize(frame, (inpWidth, inpHeight))
        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info');
-    out = net.forward()
+    outs = net.forward(getOutputsNames(net))

-    postprocess(frame, out)
+    postprocess(frame, outs)

    # Put efficiency information.
    t, _ = net.getPerfProfile()