diff --git a/3rdparty/openvx/hal/openvx_hal.cpp b/3rdparty/openvx/hal/openvx_hal.cpp
index 2999f929c5..df2d6788ef 100644
--- a/3rdparty/openvx/hal/openvx_hal.cpp
+++ b/3rdparty/openvx/hal/openvx_hal.cpp
@@ -11,6 +11,7 @@
 #include <cfloat>
 #include <climits>
 #include <cmath>
+#include <cstring>
 
 //==================================================================================================
 // utility
@@ -600,7 +601,7 @@ int ovx_hal_sepFilterInit(cvhalFilter2D **filter_context, int src_type, int dst_
 {
     if (!filter_context || !kernelx_data || !kernely_data || delta != 0 ||
         src_type != CV_8UC1 || (dst_type != CV_8UC1 && dst_type != CV_16SC1) ||
-        kernelx_length % 2 == 0 || kernely_length % 2 == 0 || anchor_x != kernelx_length / 2 || anchor_y != kernely_length / 2)
+        kernelx_length != 3 || kernely_length != 3 || anchor_x != 1 || anchor_y != 1)
         return CV_HAL_ERROR_NOT_IMPLEMENTED;
 
     ivx::border_t border;
@@ -1076,7 +1077,7 @@ int ovx_hal_integral(int depth, int sdepth, int, const uchar * a, size_t astep,
             ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U32,
                 ivx::Image::createAddressing(w, h, 4, (vx_int32)bstep), (unsigned int *)(b + bstep + sizeof(unsigned int)));
         ivx::IVX_CHECK_STATUS(vxuIntegralImage(ctx, ia, ib));
-        memset(b, 0, (w + 1) * sizeof(unsigned int));
+        std::memset(b, 0, (w + 1) * sizeof(unsigned int));
         b += bstep;
         for (int i = 0; i < h; i++, b += bstep)
         {
diff --git a/3rdparty/openvx/include/ivx.hpp b/3rdparty/openvx/include/ivx.hpp
index 47213e6d5c..5c3029d617 100644
--- a/3rdparty/openvx/include/ivx.hpp
+++ b/3rdparty/openvx/include/ivx.hpp
@@ -32,6 +32,12 @@ static const vx_enum VX_INTERPOLATION_NEAREST_NEIGHBOR = VX_INTERPOLATION_TYPE_N
 static const vx_enum VX_BORDER_CONSTANT = VX_BORDER_MODE_CONSTANT;
 static const vx_enum VX_BORDER_REPLICATE = VX_BORDER_MODE_REPLICATE;
 
+#else
+
+    #ifdef IVX_RENAMED_REFS
+        static const vx_enum VX_REF_ATTRIBUTE_TYPE = VX_REFERENCE_TYPE;
+    #endif
+
 #endif
 
 #ifndef IVX_USE_CXX98
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83d567685e..7d044583bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,6 +91,14 @@ if(POLICY CMP0042)
   cmake_policy(SET CMP0042 NEW)
 endif()
 
+if(POLICY CMP0051)
+  cmake_policy(SET CMP0051 NEW)
+endif()
+
+if(POLICY CMP0056)
+  cmake_policy(SET CMP0056 NEW)
+endif()
+
 include(cmake/OpenCVUtils.cmake)
 
 # must go before the project command
@@ -280,16 +288,6 @@ OCV_OPTION(ENABLE_COVERAGE            "Enable coverage collection with  GCov"
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_POPCNT              "Enable POPCNT instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_AVX2                "Enable AVX2 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_FMA3                "Enable FMA3 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 "${NEON}" IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
@@ -299,6 +297,9 @@ OCV_OPTION(ENABLE_IMPL_COLLECTION     "Collect implementation data on function c
 OCV_OPTION(ENABLE_INSTRUMENTATION     "Instrument functions to collect calls trace and performance" OFF )
 OCV_OPTION(ENABLE_GNU_STL_DEBUG       "Enable GNU STL Debug mode (defines _GLIBCXX_DEBUG)"       OFF IF ((NOT CMAKE_VERSION VERSION_LESS "2.8.11") AND CMAKE_COMPILER_IS_GNUCXX) )
 OCV_OPTION(GENERATE_ABI_DESCRIPTOR    "Generate XML file for abi_compliance_checker tool" OFF IF UNIX)
+OCV_OPTION(CV_ENABLE_INTRINSICS       "Use intrinsic-based optimized code" ON )
+OCV_OPTION(CV_DISABLE_OPTIMIZATION    "Disable explicit optimized code (dispatched code/intrinsics/loop unrolling/etc)" OFF )
+
 
 OCV_OPTION(DOWNLOAD_EXTERNAL_TEST_DATA "Download external test data (Python executable and OPENCV_TEST_DATA_PATH environment variable may be required)" OFF )
 
@@ -499,6 +500,9 @@ if(CMAKE_GENERATOR MATCHES "Makefiles|Ninja" AND "${CMAKE_BUILD_TYPE}" STREQUAL
   set(CMAKE_BUILD_TYPE Release)
 endif()
 
+# --- Python Support ---
+include(cmake/OpenCVDetectPython.cmake)
+
 include(cmake/OpenCVCompilerOptions.cmake)
 
 
@@ -578,9 +582,6 @@ else()
   unset(DOXYGEN_FOUND CACHE)
 endif()
 
-# --- Python Support ---
-include(cmake/OpenCVDetectPython.cmake)
-
 # --- Java Support ---
 include(cmake/OpenCVDetectApacheAnt.cmake)
 if(ANDROID)
@@ -869,6 +870,33 @@ if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio")
   status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()
 
+
+# ========================= CPU code generation mode =========================
+status("")
+status("  CPU/HW features:")
+status("    Baseline:"  "${CPU_BASELINE_FINAL}")
+if(NOT CPU_BASELINE STREQUAL CPU_BASELINE_FINAL)
+  status("      requested:"  "${CPU_BASELINE}")
+endif()
+if(CPU_BASELINE_REQUIRE)
+  status("      required:"  "${CPU_BASELINE_REQUIRE}")
+endif()
+if(CPU_BASELINE_DISABLE)
+  status("      disabled:"  "${CPU_BASELINE_DISABLE}")
+endif()
+if(CPU_DISPATCH_FINAL OR CPU_DISPATCH)
+  status("    Dispatched code generation:"  "${CPU_DISPATCH_FINAL}")
+  if(NOT CPU_DISPATCH STREQUAL CPU_DISPATCH_FINAL)
+    status("      requested:"  "${CPU_DISPATCH}")
+  endif()
+  if(CPU_DISPATCH_REQUIRE)
+    status("      required:"  "${CPU_DISPATCH_REQUIRE}")
+  endif()
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    status("      ${OPT} (${CPU_${OPT}_USAGE_COUNT} files):"  "+ ${CPU_DISPATCH_${OPT}_INCLUDED}")
+  endforeach()
+endif()
+
 # ========================== C/C++ options ==========================
 if(CMAKE_CXX_COMPILER_VERSION)
   set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_CXX_COMPILER_VERSION})")
diff --git a/apps/interactive-calibration/CMakeLists.txt b/apps/interactive-calibration/CMakeLists.txt
index 735c4538ef..41abf7c908 100644
--- a/apps/interactive-calibration/CMakeLists.txt
+++ b/apps/interactive-calibration/CMakeLists.txt
@@ -1,4 +1,7 @@
-set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_aruco opencv_highgui opencv_calib3d opencv_videoio)
+set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio)
+if(${BUILD_opencv_aruco})
+    list(APPEND OPENCV_INTERACTIVECALIBRATION_DEPS opencv_aruco)
+endif()
 ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS})
 
 if(NOT OCV_DEPENDENCIES_FOUND)
diff --git a/apps/interactive-calibration/frameProcessor.cpp b/apps/interactive-calibration/frameProcessor.cpp
index 1e672b0c45..07b32dbe12 100644
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@@ -7,7 +7,6 @@
 
 #include <opencv2/calib3d.hpp>
 #include <opencv2/imgproc.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/highgui.hpp>
 
 #include <vector>
@@ -75,6 +74,7 @@ bool CalibProcessor::detectAndParseChessboard(const cv::Mat &frame)
 
 bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
 {
+#ifdef HAVE_OPENCV_ARUCO
     cv::Ptr<cv::aruco::Board> board = mCharucoBoard.staticCast<cv::aruco::Board>();
 
     std::vector<std::vector<cv::Point2f> > corners, rejected;
@@ -95,14 +95,16 @@ bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
         }
         centerX /= currentCharucoCorners.size[0];
         centerY /= currentCharucoCorners.size[0];
-        //cv::circle(frame, cv::Point2f(centerX, centerY), 10, cv::Scalar(0, 255, 0), 10);
+
         mTemplateLocations.insert(mTemplateLocations.begin(), cv::Point2f(centerX, centerY));
         cv::aruco::drawDetectedCornersCharuco(frame, currentCharucoCorners, currentCharucoIds);
         mCurrentCharucoCorners = currentCharucoCorners;
         mCurrentCharucoIds = currentCharucoIds;
         return true;
     }
-
+#else
+    (void)frame;
+#endif
     return false;
 }
 
@@ -231,6 +233,7 @@ bool CalibProcessor::checkLastFrame()
         }
     }
     else {
+#ifdef HAVE_OPENCV_ARUCO
         cv::Mat r, t, angles;
         std::vector<cv::Point3f> allObjPoints;
         allObjPoints.reserve(mCurrentCharucoIds.total());
@@ -248,6 +251,7 @@ bool CalibProcessor::checkLastFrame()
             mCalibData->allCharucoCorners.pop_back();
             mCalibData->allCharucoIds.pop_back();
         }
+#endif
     }
     return isFrameBad;
 }
@@ -266,10 +270,12 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
     switch(mBoardType)
     {
     case chAruco:
+#ifdef HAVE_OPENCV_ARUCO
         mArucoDictionary = cv::aruco::getPredefinedDictionary(
                     cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
         mCharucoBoard = cv::aruco::CharucoBoard::create(mBoardSize.width, mBoardSize.height, capParams.charucoSquareLenght,
                                                         capParams.charucoMarkerSize, mArucoDictionary);
+#endif
         break;
     case AcirclesGrid:
         mBlobDetectorPtr = cv::SimpleBlobDetector::create();
diff --git a/apps/interactive-calibration/frameProcessor.hpp b/apps/interactive-calibration/frameProcessor.hpp
index 4dbb8ab314..222b83143f 100644
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@@ -6,8 +6,10 @@
 #define FRAME_PROCESSOR_HPP
 
 #include <opencv2/core.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/calib3d.hpp>
+#ifdef HAVE_OPENCV_ARUCO
+#include <opencv2/aruco/charuco.hpp>
+#endif
 
 #include "calibCommon.hpp"
 #include "calibController.hpp"
@@ -37,8 +39,10 @@ protected:
     cv::Mat mCurrentCharucoIds;
 
     cv::Ptr<cv::SimpleBlobDetector> mBlobDetectorPtr;
+#ifdef HAVE_OPENCV_ARUCO
     cv::Ptr<cv::aruco::Dictionary> mArucoDictionary;
     cv::Ptr<cv::aruco::CharucoBoard> mCharucoBoard;
+#endif
 
     int mNeededFramesNum;
     unsigned mDelayBetweenCaptures;
diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index af62d4ea40..10d3690b89 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -4,10 +4,13 @@
 
 #include <opencv2/core.hpp>
 #include <opencv2/calib3d.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/cvconfig.h>
 #include <opencv2/highgui.hpp>
 
+#ifdef HAVE_OPENCV_ARUCO
+#include <opencv2/aruco/charuco.hpp>
+#endif
+
 #include <string>
 #include <vector>
 #include <stdexcept>
@@ -50,31 +53,27 @@ bool calib::showOverlayMessage(const std::string& message)
 #endif
 }
 
-static void deleteButton(int state, void* data)
+static void deleteButton(int, void* data)
 {
-    state++; //to avoid gcc warnings
     (static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteLastFrame();
     calib::showOverlayMessage("Last frame deleted");
 }
 
-static void deleteAllButton(int state, void* data)
+static void deleteAllButton(int, void* data)
 {
-    state++;
     (static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteAllData();
     calib::showOverlayMessage("All frames deleted");
 }
 
-static void saveCurrentParamsButton(int state, void* data)
+static void saveCurrentParamsButton(int, void* data)
 {
-    state++;
     if((static_cast<cv::Ptr<calibDataController>*>(data))->get()->saveCurrentCameraParameters())
         calib::showOverlayMessage("Calibration parameters saved");
 }
 
 #ifdef HAVE_QT
-static void switchVisualizationModeButton(int state, void* data)
+static void switchVisualizationModeButton(int, void* data)
 {
-    state++;
     ShowProcessor* processor = static_cast<ShowProcessor*>(((cv::Ptr<FrameProcessor>*)data)->get());
     processor->switchVisualizationMode();
 }
@@ -103,6 +102,11 @@ int main(int argc, char** argv)
 
     captureParameters capParams = paramsController.getCaptureParameters();
     internalParameters intParams = paramsController.getInternalParameters();
+#ifndef HAVE_OPENCV_ARUCO
+    if(capParams.board == chAruco)
+        CV_Error(cv::Error::StsNotImplemented, "Aruco module is disabled in current build configuration."
+                                               " Consider usage of another calibration pattern\n");
+#endif
 
     cv::TermCriteria solverTermCrit = cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS,
                                                        intParams.solverMaxIters, intParams.solverEps);
@@ -172,6 +176,7 @@ int main(int argc, char** argv)
                                                     calibrationFlags, solverTermCrit);
                 }
                 else {
+#ifdef HAVE_OPENCV_ARUCO
                     cv::Ptr<cv::aruco::Dictionary> dictionary =
                             cv::aruco::getPredefinedDictionary(cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
                     cv::Ptr<cv::aruco::CharucoBoard> charucoboard =
@@ -183,6 +188,7 @@ int main(int argc, char** argv)
                                                            globalData->cameraMatrix, globalData->distCoeffs,
                                                            cv::noArray(), cv::noArray(), globalData->stdDeviations, cv::noArray(),
                                                            globalData->perViewErrors, calibrationFlags, solverTermCrit);
+#endif
                 }
                 dataController->updateUndistortMap();
                 dataController->printParametersToConsole(std::cout);
diff --git a/cmake/FindOpenVX.cmake b/cmake/FindOpenVX.cmake
index 0a55e951d8..6cba52717c 100644
--- a/cmake/FindOpenVX.cmake
+++ b/cmake/FindOpenVX.cmake
@@ -25,6 +25,20 @@ endif()
 
 if(OPENVX_INCLUDE_DIR AND OPENVX_LIBRARIES)
   set(HAVE_OPENVX TRUE)
+
+  try_compile(OPENVX_RENAMED_REF
+      "${OpenCV_BINARY_DIR}"
+      "${OpenCV_SOURCE_DIR}/cmake/checks/openvx_refenum_test.cpp"
+      CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${OPENVX_INCLUDE_DIR}"
+      LINK_LIBRARIES ${OPENVX_LIBRARIES}
+      OUTPUT_VARIABLE OUTPUT
+  )
+  if(OPENVX_RENAMED_REF)
+      add_definitions(-DIVX_RENAMED_REFS=1)
+      message(STATUS "OpenVX: Checking reference attribute name convention... New")
+  else()
+      message(STATUS "OpenVX: Checking reference attribute name convention... Old")
+  endif()
 endif()
 
 if(NOT HAVE_OPENVX)
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
new file mode 100644
index 0000000000..b849f02b14
--- /dev/null
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -0,0 +1,651 @@
+# x86/x86-64 arch:
+# SSE / SSE2 (always available on 64-bit CPUs)
+# SSE3 / SSSE3
+# SSE4_1 / SSE4_2 / POPCNT
+# AVX / AVX2 / AVX512
+# FMA3
+
+# CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag)
+# CPU_{opt}_IMPLIES=<list>
+# CPU_{opt}_FORCE=<list> - subset of "implies" list
+# CPU_{opt}_FLAGS_ON=""
+# CPU_{opt}_FEATURE_ALIAS - mapping to CV_CPU_* HWFeature enum
+
+# Input variables:
+# CPU_BASELINE=<list> - preferred list of baseline optimizations
+# CPU_DISPATCH=<list> - preferred list of dispatched optimizations
+
+# Advanced input variables:
+# CPU_BASELINE_REQUIRE=<list> - list of required baseline optimizations
+# CPU_DISPATCH_REQUIRE=<list> - list of required dispatched optimizations
+# CPU_BASELINE_DISABLE=<list> - list of disabled baseline optimizations
+
+# Output variables:
+# CPU_BASELINE_FINAL=<list> - final list of enabled compiler optimizations
+# CPU_DISPATCH_FINAL=<list> - final list of dispatched optimizations
+#
+# CPU_DISPATCH_FLAGS_${opt} - flags for source files compiled separately (_opt_avx2.cpp)
+
+set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3") # without AVX512
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
+
+ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
+
+
+set(HELP_CPU_BASELINE "Specify list of enabled baseline CPU optimizations")
+set(HELP_CPU_BASELINE_REQUIRE "Specify list of required baseline CPU optimizations")
+set(HELP_CPU_BASELINE_DISABLE "Specify list of forbidden baseline CPU optimizations")
+set(HELP_CPU_DISPATCH "Specify list of dispatched CPU optimizations")
+set(HELP_CPU_DISPATCH_REQUIRE "Specify list of required dispatched CPU optimizations")
+
+foreach(var CPU_BASELINE CPU_BASELINE_REQUIRE CPU_BASELINE_DISABLE CPU_DISPATCH CPU_DISPATCH_REQUIRE)
+  if(DEFINED ${var})  # only touch options that were actually provided
+    string(REPLACE "," ";" _list "${${var}}")  # accept comma-separated input: turn ',' into the CMake list separator
+    set(${var} "${_list}" CACHE STRING "${HELP_${var}}" FORCE)  # write the normalized list back to the cache
+  endif()
+endforeach()
+
+# process legacy flags: a defined legacy option adds OPT to CPU_BASELINE_REQUIRE (true value) or CPU_BASELINE_DISABLE (false value)
+macro(ocv_optimization_process_obsolete_option legacy_flag OPT legacy_warn)
+  if(DEFINED ${legacy_flag})  # nothing to do when the legacy option was never set
+    if(${legacy_warn})  # deprecation notice is printed only when the caller passes a true legacy_warn
+      message(STATUS "WARNING: Option ${legacy_flag}='${${legacy_flag}}' is deprecated and should not be used anymore")
+      message(STATUS "         Behaviour of this option is not backward compatible")
+      message(STATUS "         Refer to 'CPU_BASELINE'/'CPU_DISPATCH' CMake options documentation")
+    endif()
+    if(${legacy_flag})
+      if(NOT ";${CPU_BASELINE_REQUIRE};" MATCHES ";${OPT};")  # skip if OPT is already listed
+        set(CPU_BASELINE_REQUIRE "${CPU_BASELINE_REQUIRE};${OPT}" CACHE STRING "${HELP_CPU_BASELINE_REQUIRE}" FORCE)
+      endif()
+    else()
+      if(NOT ";${CPU_BASELINE_DISABLE};" MATCHES ";${OPT};")  # skip if OPT is already listed
+        set(CPU_BASELINE_DISABLE "${CPU_BASELINE_DISABLE};${OPT}" CACHE STRING "${HELP_CPU_BASELINE_DISABLE}" FORCE)
+      endif()
+    endif()
+  endif()
+endmacro()
+ocv_optimization_process_obsolete_option(ENABLE_SSE SSE ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE2 SSE2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE3 SSE3 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSSE3 SSSE3 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE41 SSE4_1 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE42 SSE4_2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_POPCNT POPCNT ON)
+ocv_optimization_process_obsolete_option(ENABLE_AVX AVX ON)
+ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
+
+ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
+ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
+
+
+macro(ocv_is_optimization_in_list resultvar check_opt)  # sets ${resultvar} to 1 if check_opt is in ARGN or reachable via CPU_<opt>_IMPLIES, else 0
+  set(__checked "")  # optimizations already visited
+  set(__queue ${ARGN})  # optimizations to examine in the current pass
+  set(${resultvar} 0)
+  while(__queue AND NOT ${resultvar})
+    list(REMOVE_DUPLICATES __queue)
+    set(__queue_current ${__queue})
+    set(__queue "")  # refilled below with the IMPLIES lists of the entries examined in this pass
+    foreach(OPT ${__queue_current})
+      if("x${OPT}" STREQUAL "x${check_opt}")  # "x" prefix keeps if() from treating the names as variables
+        set(${resultvar} 1)
+        break()  # leaves only the foreach; the while condition then ends the search
+      elseif(NOT ";${__checked};" MATCHES ";${OPT};")
+        list(APPEND __queue ${CPU_${OPT}_IMPLIES})  # expand an entry only the first time it is seen
+      endif()
+      list(APPEND __checked ${OPT})
+    endforeach()
+  endwhile()
+endmacro()
+
+macro(ocv_is_optimization_in_force_list resultvar check_opt)  # sets ${resultvar} to 1 if check_opt is in ARGN or reachable via CPU_<opt>_FORCE, else 0
+  set(__checked "")  # optimizations already visited
+  set(__queue ${ARGN})  # optimizations to examine in the current pass
+  set(${resultvar} 0)
+  while(__queue AND NOT ${resultvar})
+    list(REMOVE_DUPLICATES __queue)
+    set(__queue_current ${__queue})
+    set(__queue "")  # refilled below with the FORCE lists of the entries examined in this pass
+    foreach(OPT ${__queue_current})
+      if("x${OPT}" STREQUAL "x${check_opt}")  # "x" prefix: compare the names as plain strings, never as variable references
+        set(${resultvar} 1)
+        break()  # leaves only the foreach; the while condition then ends the search
+      elseif(NOT ";${__checked};" MATCHES ";${OPT};")
+        list(APPEND __queue ${CPU_${OPT}_FORCE})  # expand an entry only the first time it is seen
+      endif()
+      list(APPEND __checked ${OPT})
+    endforeach()
+  endwhile()
+endmacro()
+
+macro(ocv_append_optimization_flag var OPT)  # appends CPU_<OPT>_FLAGS_ON to ${var}, first removing flags that match CPU_<OPT>_FLAGS_CONFLICT
+  if(CPU_${OPT}_FLAGS_CONFLICT)
+    string(REGEX REPLACE " ${CPU_${OPT}_FLAGS_CONFLICT}" "" ${var} " ${${var}}")  # input is space-prefixed so the first flag can match too
+    string(REGEX REPLACE "^ +" "" ${var} "${${var}}")  # drop the leading space(s) again
+  endif()
+  set(${var} "${${var}} ${CPU_${OPT}_FLAGS_ON}")
+endmacro()
+
+# Support GCC -march=native or Intel Compiler -xHost flags
+if(";${CPU_BASELINE};" MATCHES ";NATIVE;" OR ";${CPU_BASELINE};" MATCHES ";HOST;")
+  set(CPU_BASELINE_DETECT ON)
+  set(_add_native_flag ON)
+elseif(";${CPU_BASELINE};" MATCHES ";DETECT;")
+  set(CPU_BASELINE_DETECT ON)
+elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
+  if(DEFINED CPU_BASELINE)
+    message(STATUS "CPU: Detected '-march=native' or '-xHost' compiler flag. Force CPU_BASELINE=DETECT.")
+  endif()
+  set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+  set(CPU_BASELINE_DETECT ON)
+endif()
+
+if(X86 OR X86_64)
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX512")
+
+  ocv_update(CPU_SSE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
+  ocv_update(CPU_SSE2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
+  ocv_update(CPU_SSE3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp")
+  ocv_update(CPU_SSSE3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp")
+  ocv_update(CPU_SSE4_1_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp")
+  ocv_update(CPU_SSE4_2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp")
+  ocv_update(CPU_POPCNT_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_popcnt.cpp")
+  ocv_update(CPU_AVX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
+  ocv_update(CPU_AVX2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
+  ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  ocv_update(CPU_AVX512_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp")
+
+  if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE)
+    ocv_update(CPU_AVX512_IMPLIES "AVX2")
+    ocv_update(CPU_AVX512_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_AVX2_IMPLIES "AVX;FMA3;FP16")
+    ocv_update(CPU_FMA3_IMPLIES "AVX2")
+    ocv_update(CPU_FMA3_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_FP16_IMPLIES "AVX")
+    ocv_update(CPU_FP16_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_AVX_IMPLIES "SSE4_2")
+    ocv_update(CPU_SSE4_2_IMPLIES "SSE4_1;POPCNT")
+    ocv_update(CPU_POPCNT_IMPLIES "SSE4_1")
+    ocv_update(CPU_POPCNT_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_SSE4_1_IMPLIES "SSE3;SSSE3")
+    ocv_update(CPU_SSSE3_IMPLIES "SSE3")
+    ocv_update(CPU_SSE3_IMPLIES "SSE2")
+    ocv_update(CPU_SSE2_IMPLIES "SSE")
+  endif()
+
+  if(CV_ICC)
+    macro(ocv_intel_compiler_optimization_option name unix_flags msvc_flags)
+      ocv_update(CPU_${name}_FLAGS_NAME "${name}")
+      if(MSVC)
+        set(enable_flags "${msvc_flags}")
+        set(flags_conflict "/arch:[^ ]+")
+      else()
+        set(enable_flags "${unix_flags}")
+        set(flags_conflict "-msse[^ ]*|-mssse3|-mavx[^ ]*|-march[^ ]+")
+      endif()
+      ocv_update(CPU_${name}_FLAGS_ON "${enable_flags}")
+      if(flags_conflict)
+        ocv_update(CPU_${name}_FLAGS_CONFLICT "${flags_conflict}")
+      endif()
+    endmacro()
+    ocv_intel_compiler_optimization_option(AVX2 "-march=core-avx2" "/arch:CORE-AVX2")
+    ocv_intel_compiler_optimization_option(FP16 "-mavx" "/arch:AVX")
+    ocv_intel_compiler_optimization_option(AVX "-mavx" "/arch:AVX")
+    ocv_intel_compiler_optimization_option(FMA3 "" "")
+    ocv_intel_compiler_optimization_option(POPCNT "" "")
+    ocv_intel_compiler_optimization_option(SSE4_2 "-msse4.2" "/arch:SSE4.2")
+    ocv_intel_compiler_optimization_option(SSE4_1 "-msse4.1" "/arch:SSE4.1")
+    ocv_intel_compiler_optimization_option(SSE3 "-msse3" "/arch:SSE3")
+    ocv_intel_compiler_optimization_option(SSSE3 "-mssse3" "/arch:SSSE3")
+    ocv_intel_compiler_optimization_option(SSE2 "-msse2" "/arch:SSE2")
+    if(NOT X86_64) # x64 compiler doesn't support /arch:sse
+      ocv_intel_compiler_optimization_option(SSE "-msse" "/arch:SSE")
+    endif()
+    #ocv_intel_compiler_optimization_option(AVX512   "-march=core-avx512")
+  elseif(CMAKE_COMPILER_IS_GNUCXX)
+    ocv_update(CPU_AVX2_FLAGS_ON "-mavx2")
+    ocv_update(CPU_FP16_FLAGS_ON "-mf16c")
+    ocv_update(CPU_AVX_FLAGS_ON "-mavx")
+    ocv_update(CPU_FMA3_FLAGS_ON "-mfma")
+    ocv_update(CPU_POPCNT_FLAGS_ON "-mpopcnt")
+    ocv_update(CPU_SSE4_2_FLAGS_ON "-msse4.2")
+    ocv_update(CPU_SSE4_1_FLAGS_ON "-msse4.1")
+    ocv_update(CPU_SSE3_FLAGS_ON "-msse3")
+    ocv_update(CPU_SSSE3_FLAGS_ON "-mssse3")
+    ocv_update(CPU_SSE2_FLAGS_ON "-msse2")
+    ocv_update(CPU_SSE_FLAGS_ON "-msse")
+    if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0")
+      ocv_update(CPU_AVX512_FLAGS_ON "-mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi")
+    endif()
+  elseif(MSVC)
+    ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2")
+    ocv_update(CPU_AVX_FLAGS_ON "/arch:AVX")
+    if(NOT MSVC64)
+      # 64-bit MSVC compiler uses SSE/SSE2 by default
+      ocv_update(CPU_SSE_FLAGS_ON "/arch:SSE")
+      ocv_update(CPU_SSE_SUPPORTED ON)
+      ocv_update(CPU_SSE2_FLAGS_ON "/arch:SSE2")
+      ocv_update(CPU_SSE2_SUPPORTED ON)
+    else()
+      ocv_update(CPU_SSE_SUPPORTED ON)
+      ocv_update(CPU_SSE2_SUPPORTED ON)
+    endif()
+    # Other instruction sets are supported by default since MSVC 2008 at least
+  else()
+    message(WARNING "TODO: Unsupported compiler")
+  endif()
+
+  if(NOT DEFINED CPU_DISPATCH)
+    set(CPU_DISPATCH "SSE4_1;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}")
+  endif()
+
+  if(NOT DEFINED CPU_BASELINE)
+    if(X86_64)
+      set(CPU_BASELINE "SSSE3" CACHE STRING "${HELP_CPU_BASELINE}")
+    else()
+      set(CPU_BASELINE "SSE2" CACHE STRING "${HELP_CPU_BASELINE}")
+    endif()
+  endif()
+
+elseif(ARM OR AARCH64)
+  ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  if(NOT AARCH64)
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
+    ocv_update(CPU_NEON_FLAGS_ON "-mfpu=neon")
+    ocv_update(CPU_VFPV3_FLAGS_ON "-mfpu=vfpv3")
+    ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16")
+    set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+  else()
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
+    ocv_update(CPU_NEON_FLAGS_ON "")
+    set(CPU_BASELINE "NEON" CACHE STRING "${HELP_CPU_BASELINE}")
+  endif()
+endif()
+
+# Helper values for cmake-gui
+set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}")
+set_property(CACHE CPU_BASELINE PROPERTY STRINGS "" ${CPU_KNOWN_OPTIMIZATIONS})
+set_property(CACHE CPU_DISPATCH PROPERTY STRINGS "" ${CPU_KNOWN_OPTIMIZATIONS})
+
+set(CPU_BASELINE_FLAGS "")
+
+set(CPU_BASELINE_FINAL "")
+set(CPU_DISPATCH_FINAL "")
+
+macro(ocv_check_compiler_optimization OPT)  # sets CPU_<OPT>_SUPPORTED to ON when the checks below succeed
+  if(NOT DEFINED CPU_${OPT}_SUPPORTED)  # an already defined value is left untouched
+    if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE)
+      set(_varname "")  # will hold the NAME of the variable carrying the check result
+      if(CPU_${OPT}_TEST_FILE)
+        set(__available 0)
+        if(CPU_BASELINE_DETECT)  # detect mode: try the test file with the current baseline flags only
+          set(_varname "HAVE_CPU_${OPT}_SUPPORT")
+          ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          if(${_varname})
+            list(APPEND CPU_BASELINE_FINAL ${OPT})  # passes with the baseline flags alone, so record it as baseline
+            set(__available 1)
+          endif()
+        endif()
+        if(NOT __available)  # otherwise retry with the optimization's own flags
+          if(NOT "x${CPU_${OPT}_FLAGS_NAME}" STREQUAL "x")
+            set(_varname "HAVE_CPU_${CPU_${OPT}_FLAGS_NAME}")
+            set(_compile_flags "${CPU_BASELINE_FLAGS}")
+            ocv_append_optimization_flag(_compile_flags ${OPT})
+            ocv_check_compiler_flag(CXX "${_compile_flags}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          elseif(NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x")
+            ocv_check_flag_support(CXX "${CPU_${OPT}_FLAGS_ON}" _varname "" "${CPU_${OPT}_TEST_FILE}")  # presumably stores the result variable name in _varname - confirm against ocv_check_flag_support
+          else()
+            set(_varname "HAVE_CPU_${OPT}_SUPPORT")
+            set(_compile_flags "${CPU_BASELINE_FLAGS}")
+            ocv_append_optimization_flag(_compile_flags ${OPT})
+            ocv_check_compiler_flag(CXX "${_compile_flags}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          endif()
+        endif()
+      else()
+        ocv_check_flag_support(CXX "${CPU_${OPT}_FLAGS_ON}" _varname "")  # no test file: check the flag alone
+      endif()
+      if(_varname AND ${_varname})  # a result variable was named and it evaluates to true
+        set(CPU_${OPT}_SUPPORTED ON)
+      elseif(NOT CPU_${OPT}_SUPPORTED)
+        message(STATUS "${OPT} is not supported by C++ compiler")
+      endif()
+    else()
+      set(CPU_${OPT}_SUPPORTED ON)  # neither a flag nor a test file to check: treated as supported
+    endif()
+  endif()
+endmacro()
+
+foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
+  set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "" FORCE)
+  if(NOT DEFINED CPU_${OPT}_FORCE)
+    set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}")
+  endif()
+endforeach()
+
+if(_add_native_flag)  # set above when CPU_BASELINE contains NATIVE or HOST
+  set(_varname "HAVE_CPU_NATIVE_SUPPORT")  # name of the result variable handed to ocv_check_compiler_flag
+  ocv_check_compiler_flag(CXX "-march=native" "${_varname}" "")
+  if(${_varname})  # test the check result itself; the name string alone is always true
+    set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} -march=native")
+  else()  # fall back to the -xHost / /QxHost spelling
+    set(_varname "HAVE_CPU_HOST_SUPPORT")
+    if(MSVC)
+      set(_flag "/QxHost")
+    else()
+      set(_flag "-xHost")
+    endif()
+    ocv_check_compiler_flag(CXX "${_flag}" "${_varname}" "")
+    if(${_varname})
+      set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} ${_flag}")  # append the flag that was just probed
+    endif()
+  endif()
+endif()
+
+foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})  # sort every known optimization into CPU_BASELINE_FINAL / CPU_DISPATCH_FINAL
+  set(__is_disabled 0)
+  foreach(OPT2 ${CPU_BASELINE_DISABLE})
+    ocv_is_optimization_in_list(__is_disabled ${OPT2} ${OPT})  # OPT counts as disabled when it is, or transitively implies, a disabled entry
+    if(__is_disabled)
+      break()
+    endif()
+  endforeach()
+  if(__is_disabled)
+    set(__is_from_baseline 0)
+  else()
+    ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE_REQUIRE})  # listed in, or implied by, a required baseline entry
+    if(NOT __is_from_baseline)
+      ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE})
+    endif()
+  endif()
+  ocv_is_optimization_in_list(__is_from_dispatch ${OPT} ${CPU_DISPATCH_REQUIRE})
+  if(NOT __is_from_dispatch)
+    ocv_is_optimization_in_list(__is_from_dispatch ${OPT} ${CPU_DISPATCH})
+  endif()
+  if(__is_from_dispatch OR __is_from_baseline OR CPU_BASELINE_DETECT)  # run compiler checks only for optimizations that may be used
+    ocv_check_compiler_optimization(${OPT})
+  endif()
+  if(CPU_BASELINE_DETECT AND NOT __is_from_baseline AND NOT __is_disabled)
+    ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE_FINAL})  # detect mode: the check above may have added OPT to the baseline
+  endif()
+  if(CPU_${OPT}_SUPPORTED)
+    if(";${CPU_DISPATCH};" MATCHES ";${OPT};" AND NOT __is_from_baseline)  # explicitly listed in CPU_DISPATCH and not baseline
+      list(APPEND CPU_DISPATCH_FINAL ${OPT})
+    elseif(__is_from_baseline AND NOT CPU_BASELINE_DETECT)  # in detect mode the baseline list is filled by the compiler check instead
+      list(APPEND CPU_BASELINE_FINAL ${OPT})
+      ocv_append_optimization_flag(CPU_BASELINE_FLAGS ${OPT})
+    endif()
+  endif()
+endforeach()
+
+foreach(OPT ${CPU_BASELINE_REQUIRE})  # a required baseline entry missing from CPU_BASELINE_FINAL is a configure error
+  if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})")
+  endif()
+endforeach()
+
+foreach(OPT ${CPU_BASELINE})  # non-required baseline entries are only reported when they were skipped
+  if(OPT STREQUAL "DETECT" OR OPT STREQUAL "HOST" OR OPT STREQUAL "NATIVE")  # these three keywords are exempt from the report
+    # nothing
+  elseif(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    message(STATUS "Optimization ${OPT} is not available, skipped")
+  endif()
+endforeach()
+
+foreach(OPT ${CPU_DISPATCH_REQUIRE})  # a required dispatch entry must end up in CPU_DISPATCH_FINAL
+  if(";${CPU_DISPATCH_FINAL};" MATCHES ";${OPT};")
+    # OK
+  elseif(";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")  # being in the baseline instead is reported as an error
+    message(SEND_ERROR "Dispatched optimization ${OPT} is in baseline list (CPU_DISPATCH_REQUIRE=${CPU_DISPATCH_REQUIRE})")
+  else()
+    message(SEND_ERROR "Required dispatch optimization is not supported: ${OPT} (CPU_DISPATCH_REQUIRE=${CPU_DISPATCH_REQUIRE})")
+  endif()
+endforeach()
+
+foreach(OPT ${CPU_DISPATCH})  # non-required dispatch entries: accepted if present in either final list, otherwise reported
+  if(";${CPU_DISPATCH_FINAL};" MATCHES ";${OPT};")
+    # OK
+  elseif(";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    # OK
+  else()
+    message(STATUS "Dispatch optimization ${OPT} is not available, skipped")
+  endif()
+endforeach()
+
+#message(STATUS "CPU_BASELINE_FINAL=${CPU_BASELINE_FINAL}")
+#message(STATUS "CPU_DISPATCH_FINAL=${CPU_DISPATCH_FINAL}")
+
+#if(CPU_DISPATCH_FINAL AND NOT PYTHON_DEFAULT_EXECUTABLE)
+#  message(FATAL_ERROR "Python is required for CPU dispatched optimization support")
+#endif()
+
+macro(ocv_compiler_optimization_options)  # verifies and applies the baseline flags, then computes flags/definitions for each dispatch mode
+  set(__flags "${OPENCV_EXTRA_CXX_FLAGS} ${CPU_BASELINE_FLAGS}")
+  if(NOT __flags STREQUAL CACHED_CPU_BASELINE_FLAGS)  # flag set changed since the cached run: drop the cached probe result
+    set(CACHED_CPU_BASELINE_FLAGS "${__flags}" CACHE INTERNAL "" FORCE)
+    ocv_clear_vars(HAVE_CPU_BASELINE_FLAGS)
+  endif()
+  ocv_check_compiler_flag(CXX "${__flags}" HAVE_CPU_BASELINE_FLAGS)
+  if(NOT HAVE_CPU_BASELINE_FLAGS)
+    message(FATAL_ERROR "Compiler doesn't support baseline optimization flags: ${CPU_BASELINE_FLAGS}")
+  endif()
+  add_extra_compiler_option_force("${CPU_BASELINE_FLAGS}")  # the combined flag set was just probed above
+
+  foreach(OPT ${CPU_DISPATCH_FINAL})  # one pass per dispatch mode
+    set(__dispatch_flags "")
+    set(__dispatch_definitions "")
+    set(__dispatch_opts "")
+    set(__dispatch_opts_force "")
+    foreach(OPT2 ${CPU_KNOWN_OPTIMIZATIONS})
+      if(NOT CPU_${OPT2}_SUPPORTED)
+        #continue()
+      else()
+      ocv_is_optimization_in_list(__is_from_baseline ${OPT2} ${CPU_BASELINE_FINAL})
+      if(NOT __is_from_baseline)  # entries already in the baseline contribute nothing to the dispatch mode
+        ocv_is_optimization_in_list(__is_active ${OPT2} ${OPT})
+        if(__is_active)
+          ocv_append_optimization_flag(__dispatch_flags ${OPT2})
+          list(APPEND __dispatch_definitions "CV_CPU_COMPILE_${OPT2}=1")
+          list(APPEND __dispatch_opts "${OPT2}")
+        endif()
+        ocv_is_optimization_in_force_list(__is_force ${OPT2} ${OPT})
+        if(__is_force)
+          list(APPEND __dispatch_opts_force "${OPT2}")
+        endif()
+      endif()
+      endif()
+    endforeach()
+    set(__flags "${OPENCV_EXTRA_CXX_FLAGS} ${__dispatch_flags}")
+    if(NOT __flags STREQUAL CACHED_CPU_DISPATCH_${OPT}_FLAGS)  # same cache-invalidation scheme as for the baseline flags
+      set(CACHED_CPU_DISPATCH_${OPT}_FLAGS "${__flags}" CACHE INTERNAL "" FORCE)
+      ocv_clear_vars(HAVE_CPU_DISPATCH_FLAGS_${OPT})
+    endif()
+    ocv_check_compiler_flag(CXX "${__flags}" HAVE_CPU_DISPATCH_FLAGS_${OPT})
+    if(NOT HAVE_CPU_DISPATCH_FLAGS_${OPT})
+      message(FATAL_ERROR "Compiler doesn't support optimization flags for ${OPT} dispatch mode: ${__dispatch_flags}")
+    endif()
+    set(CPU_DISPATCH_FLAGS_${OPT} "${__dispatch_flags}")  # results published for ocv_compiler_optimization_process_sources
+    set(CPU_DISPATCH_DEFINITIONS_${OPT} "${__dispatch_definitions}")
+    set(CPU_DISPATCH_${OPT}_INCLUDED "${__dispatch_opts}")
+    set(CPU_DISPATCH_${OPT}_FORCED "${__dispatch_opts_force}")
+  endforeach()
+
+  if(ENABLE_POWERPC)  # remaining options go through add_extra_compiler_option (not the _force variant)
+    add_extra_compiler_option("-mcpu=G3 -mtune=G5")
+  endif()
+  if(ARM)
+    add_extra_compiler_option("-mfp16-format=ieee")
+  endif(ARM)
+  if(ENABLE_NEON)
+    add_extra_compiler_option("-mfpu=neon")
+  endif()
+  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)  # vfpv3 is only requested when NEON is off
+    add_extra_compiler_option("-mfpu=vfpv3")
+  endif()
+endmacro()
+
+macro(ocv_compiler_optimization_options_finalize)  # floating-point model and intrinsic options chosen from the flags collected so far
+  if(CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64))
+    if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)  # 4-byte pointers only
+      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
+        add_extra_compiler_option(-mfpmath=sse) # !! important - be on the same wave with x64 compilers
+      else()
+        add_extra_compiler_option(-mfpmath=387)
+      endif()
+    endif()
+  endif()
+
+  if(MSVC)
+    # Generate Intrinsic Functions
+    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
+
+    if((X86 OR X86_64) AND CMAKE_SIZEOF_VOID_P EQUAL 4 AND ";${CPU_BASELINE_FINAL};" MATCHES ";SSE;")  # 4-byte pointers with SSE in the final baseline
+      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers
+    endif()
+  endif(MSVC)
+endmacro()
+
+macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME TARGET_BASE_NAME)  # filters *.opt_*.cpp sources by enabled optimizations and attaches per-mode flags; rewrites ${SOURCES_VAR_NAME} in place
+  set(__result "")
+  set(__result_libs "")
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    set(__result_${OPT} "")  # one bucket of sources per dispatch mode
+  endforeach()
+  foreach(fname ${${SOURCES_VAR_NAME}})
+    string(TOLOWER "${fname}" fname_LOWER)  # all file name matching below is case-insensitive
+    if(fname_LOWER MATCHES "[.]opt_.*[.]cpp$")
+      if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
+        message(STATUS "Excluding from source files list: ${fname}")
+        #continue()
+      else()
+        set(__opt_found 0)
+        foreach(OPT ${CPU_BASELINE_FINAL})  # file suffix names a baseline optimization: compile it with the regular sources
+          string(TOLOWER "${OPT}" OPT_LOWER)
+          if(fname_LOWER MATCHES "_${OPT_LOWER}[.]cpp$")
+#message("${fname} BASELINE-${OPT}")
+            set(__opt_found 1)
+            list(APPEND __result "${fname}")
+            break()
+          endif()
+        endforeach()
+        foreach(OPT ${CPU_DISPATCH_FINAL})  # NOTE(review): this search also runs when the baseline loop above already matched - confirm intended
+          foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED})
+            string(TOLOWER "${OPT2}" OPT2_LOWER)
+            if(fname_LOWER MATCHES "_${OPT2_LOWER}[.]cpp$")
+              list(APPEND __result_${OPT} "${fname}")
+              math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1")
+              set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE)  # write the incremented counter back to the cache
+#message("${fname} ${OPT}")
+#message("    ${CPU_DISPATCH_${OPT}_INCLUDED}")
+#message("    ${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+#message("    ${CPU_DISPATCH_FLAGS_${OPT}}")
+              set(__opt_found 1)
+              break()
+            endif()
+          endforeach()
+          if(__opt_found)  # stop scanning further dispatch modes once the file is placed
+            set(__opt_found 1)
+            break()
+          endif()
+        endforeach()
+        if(NOT __opt_found)
+          message(STATUS "Excluding from source files list: ${fname}")
+        endif()
+      endif()
+    else()
+      list(APPEND __result "${fname}")  # files without the .opt_ marker pass through unchanged
+    endif()
+  endforeach()
+
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    if(__result_${OPT})
+#message("${OPT}: ${__result_${OPT}}")
+      if(CMAKE_GENERATOR MATCHES "^Visual")
+        # extra flags are added before common flags, so switching between optimizations doesn't work correctly
+        # Also CMAKE_CXX_FLAGS doesn't work (it is directory-based, so add_subdirectory is required)
+        add_library(${TARGET_BASE_NAME}_${OPT} OBJECT ${__result_${OPT}})  # separate OBJECT library per dispatch mode
+        ocv_append_dependant_targets(${TARGET_BASE_NAME} ${TARGET_BASE_NAME}_${OPT})
+        set_target_properties(${TARGET_BASE_NAME}_${OPT} PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+        set_target_properties(${TARGET_BASE_NAME}_${OPT} PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
+        #list(APPEND __result_libs ${TARGET_BASE_NAME}_${OPT})
+        list(APPEND __result "$<TARGET_OBJECTS:${TARGET_BASE_NAME}_${OPT}>")  # object files are fed back in through a generator expression
+      else()
+        foreach(fname ${__result_${OPT}})  # other generators: definitions and flags are set per source file
+          set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+          set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
+        endforeach()
+        list(APPEND __result ${__result_${OPT}})
+      endif()
+    endif()
+  endforeach()
+  set(${SOURCES_VAR_NAME} "${__result}")
+  list(APPEND ${LIBS_VAR_NAME} ${__result_libs})  # __result_libs is never filled in this macro (the only append is commented out)
+endmacro()
+
+macro(ocv_compiler_optimization_fill_cpu_config)  # builds the OPENCV_CPU_*_DEFINITIONS_CONFIGMAKE strings and refreshes cv_cpu_helper.h
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "")
+  foreach(OPT ${CPU_BASELINE_FINAL})
+    set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_COMPILE_${OPT} 1
+#define CV_CPU_BASELINE_COMPILE_${OPT} 1
+")
+  endforeach()
+
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_BASELINE_FEATURES 0 \\")  # start of a backslash-continued macro; one entry per baseline feature is appended below
+  foreach(OPT ${CPU_BASELINE_FINAL})
+    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")  # skip entries whose FEATURE_ALIAS is defined but empty
+      set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+    , CV_CPU_${OPT} \\")
+    endif()
+  endforeach()
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}\n")  # newline after the last continuation backslash
+
+  set(__dispatch_modes "")
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    list(APPEND __dispatch_modes ${CPU_DISPATCH_${OPT}_FORCE} ${OPT})  # NOTE(review): ocv_compiler_optimization_options sets CPU_DISPATCH_<OPT>_FORCED - confirm _FORCE is the intended name
+  endforeach()
+  list(REMOVE_DUPLICATES __dispatch_modes)
+  set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "")
+  foreach(OPT ${__dispatch_modes})
+    set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_DISPATCH_COMPILE_${OPT} 1")
+  endforeach()
+
+  set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "// AUTOGENERATED, DO NOT EDIT\n")
+  foreach(OPT ${CPU_ALL_OPTIMIZATIONS})  # one #if/#elif/#else group per optimization
+    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+      set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
+#  define CV_CPU_HAS_SUPPORT_${OPT} 1
+#  define CV_CPU_CALL_${OPT}(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
+#  define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
+#  define CV_CPU_CALL_${OPT}(...) if (CV_CPU_HAS_SUPPORT_${OPT}) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_${OPT} 0
+#  define CV_CPU_CALL_${OPT}(...)
+#endif
+")
+    endif()
+  endforeach()
+
+  set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")  # note: this path is in the source tree, not the build tree
+  if(EXISTS "${__file}")
+    file(READ "${__file}" __content)  # __content is only assigned when the file exists
+  endif()
+  if(__content STREQUAL OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE)
+    #message(STATUS "${__file} contains same content")
+  else()
+    file(WRITE "${__file}" "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}")  # rewritten only when the generated text differs
+    message(WARNING "${__file} is updated")
+  endif()
+endmacro()
+
+if(CV_DISABLE_OPTIMIZATION OR CV_ICC)  # CV_ENABLE_UNROLLED defaults to 0 when optimization is disabled or CV_ICC is set
+  ocv_update(CV_ENABLE_UNROLLED 0)  # ocv_update keeps an already-defined value
+else()
+  ocv_update(CV_ENABLE_UNROLLED 1)
+endif()
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 5bb0479113..1656840441 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -31,24 +31,21 @@ endif()
 if(MINGW OR (X86 AND UNIX AND NOT APPLE))
   # mingw compiler is known to produce unstable SSE code with -O3 hence we are trying to use -O2 instead
   if(CMAKE_COMPILER_IS_GNUCXX)
-    foreach(flags CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-      string(REPLACE "-O3" "-O2" ${flags} "${${flags}}")
-    endforeach()
-  endif()
-
-  if(CMAKE_COMPILER_IS_GNUCC)
-    foreach(flags CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG)
+    foreach(flags
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG)
       string(REPLACE "-O3" "-O2" ${flags} "${${flags}}")
     endforeach()
   endif()
 endif()
 
 if(MSVC)
-  string(REGEX REPLACE "^  *| * $" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  string(REGEX REPLACE "^  *| * $" "" CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT}")
+  string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS)
+  string(STRIP "${CMAKE_CXX_FLAGS_INIT}" CMAKE_CXX_FLAGS_INIT)
   if(CMAKE_CXX_FLAGS STREQUAL CMAKE_CXX_FLAGS_INIT)
     # override cmake default exception handling option
-    string(REPLACE "/EHsc" "/EHa" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "/EHsc" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHa")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}"  CACHE STRING "Flags used by the compiler during all build types." FORCE)
   endif()
 endif()
@@ -63,9 +60,6 @@ set(OPENCV_EXTRA_EXE_LINKER_FLAGS_RELEASE "")
 set(OPENCV_EXTRA_EXE_LINKER_FLAGS_DEBUG "")
 
 macro(add_extra_compiler_option option)
-  if(CMAKE_BUILD_TYPE)
-    set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
-  endif()
   ocv_check_flag_support(CXX "${option}" _varname "${OPENCV_EXTRA_CXX_FLAGS} ${ARGN}")
   if(${_varname})
     set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} ${option}")
@@ -77,6 +71,12 @@ macro(add_extra_compiler_option option)
   endif()
 endmacro()
 
+macro(add_extra_compiler_option_force option)  # appends option to the extra C and C++ flags without probing the compiler
+  set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} ${option}")
+  set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} ${option}")
+endmacro()
+
+
 # Gets environment variable and puts its value to the corresponding preprocessor definition
 # Useful for WINRT that has no access to environment variables
 macro(add_env_definitions option)
@@ -102,7 +102,11 @@ if(MINGW)
 endif()
 
 if(CV_ICC AND NOT ENABLE_FAST_MATH)
-  add_extra_compiler_option("-fp-model precise")
+  if(MSVC)
+    add_extra_compiler_option("/fp:precise")
+  else()
+    add_extra_compiler_option("-fp-model precise")
+  endif()
 endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
@@ -141,7 +145,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   endif()
 
   # We need pthread's
-  if(UNIX AND NOT ANDROID AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX))
+  if(UNIX AND NOT ANDROID AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX)) # TODO
     add_extra_compiler_option(-pthread)
   endif()
 
@@ -170,83 +174,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(ENABLE_FAST_MATH)
     add_extra_compiler_option(-ffast-math)
   endif()
-  if(ENABLE_POWERPC)
-    add_extra_compiler_option("-mcpu=G3 -mtune=G5")
-  endif()
-  if(ENABLE_SSE)
-    add_extra_compiler_option(-msse)
-  endif()
-  if(ENABLE_SSE2)
-    add_extra_compiler_option(-msse2)
-  elseif(X86 OR X86_64)
-    add_extra_compiler_option(-mno-sse2)
-  endif()
-  if(ARM)
-    add_extra_compiler_option("-mfp16-format=ieee")
-  endif(ARM)
-  if(ENABLE_NEON)
-    add_extra_compiler_option("-mfpu=neon")
-  endif()
-  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
-    add_extra_compiler_option("-mfpu=vfpv3")
-  endif()
-
-  # SSE3 and further should be disabled under MingW because it generates compiler errors
-  if(NOT MINGW)
-    if(ENABLE_AVX)
-      add_extra_compiler_option(-mavx)
-    elseif(X86 OR X86_64)
-      add_extra_compiler_option(-mno-avx)
-    endif()
-    if(ENABLE_AVX2)
-      add_extra_compiler_option(-mavx2)
-
-      if(ENABLE_FMA3)
-        add_extra_compiler_option(-mfma)
-      endif()
-    endif()
-
-    # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
-    if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
-      if(ENABLE_SSE3)
-        add_extra_compiler_option(-msse3)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse3)
-      endif()
-
-      if(ENABLE_SSSE3)
-        add_extra_compiler_option(-mssse3)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-ssse3)
-      endif()
-
-      if(ENABLE_SSE41)
-        add_extra_compiler_option(-msse4.1)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse4.1)
-      endif()
-
-      if(ENABLE_SSE42)
-        add_extra_compiler_option(-msse4.2)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse4.2)
-      endif()
-
-      if(ENABLE_POPCNT)
-        add_extra_compiler_option(-mpopcnt)
-      endif()
-    endif()
-  endif(NOT MINGW)
-
-  if(X86 OR X86_64)
-    if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
-      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
-        add_extra_compiler_option(-mfpmath=sse)# !! important - be on the same wave with x64 compilers
-      else()
-        add_extra_compiler_option(-mfpmath=387)
-      endif()
-    endif()
-  endif()
 
   # Profiling?
   if(ENABLE_PROFILING)
@@ -257,7 +184,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
       string(REPLACE "-fomit-frame-pointer" "" ${flags} "${${flags}}")
       string(REPLACE "-ffunction-sections" "" ${flags} "${${flags}}")
     endforeach()
-  elseif(NOT APPLE AND NOT ANDROID)
+  elseif(NOT ((IOS OR ANDROID) AND NOT BUILD_SHARED_LIBS))
     # Remove unreferenced functions: function level linking
     add_extra_compiler_option(-ffunction-sections)
   endif()
@@ -265,6 +192,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(ENABLE_COVERAGE)
     set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} --coverage")
     set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} --coverage")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage")
   endif()
 
   if(ENABLE_INSTRUMENTATION)
@@ -296,41 +224,6 @@ if(MSVC)
     set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
   endif()
 
-  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
-  endif()
-  if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
-  endif()
-
-  if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE4.1")
-  endif()
-
-  if(ENABLE_SSE3 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE3")
-  endif()
-
-  if(NOT MSVC64)
-    # 64-bit MSVC compiler uses SSE/SSE2 by default
-    if(ENABLE_SSE2 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE2")
-    endif()
-    if(ENABLE_SSE AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE")
-    endif()
-  endif()
-
-  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
-  endif()
-
-  if(X86 OR X86_64)
-    if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND ENABLE_SSE2)
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers
-    endif()
-  endif()
-
   if(OPENCV_WARNINGS_ARE_ERRORS)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /WX")
   endif()
@@ -353,6 +246,16 @@ if(NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX AND NOT ANDROID)
   set(OPENCV_EXTRA_FLAGS "-fPIC ${OPENCV_EXTRA_FLAGS}")
 endif()
 
+include(cmake/OpenCVCompilerOptimizations.cmake)
+
+if(COMMAND ocv_compiler_optimization_options)
+  ocv_compiler_optimization_options()
+endif()
+
+if(COMMAND ocv_compiler_optimization_options_finalize)
+  ocv_compiler_optimization_options_finalize()
+endif()
+
 # Add user supplied extra options (optimization, etc...)
 # ==========================================================
 set(OPENCV_EXTRA_FLAGS         "${OPENCV_EXTRA_FLAGS}"         CACHE INTERNAL "Extra compiler options")
@@ -370,6 +273,7 @@ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_OPENCV_GCC_VERSION_NUM GREATER 399)
   add_extra_compiler_option(-fvisibility-inlines-hidden)
 endif()
 
+# TODO !!!!!
 if(NOT OPENCV_FP16_DISABLE AND NOT IOS)
   if(ARM AND ENABLE_NEON)
     set(FP16_OPTION "-mfpu=neon-fp16")
@@ -378,7 +282,7 @@ if(NOT OPENCV_FP16_DISABLE AND NOT IOS)
   endif()
   try_compile(__VALID_FP16
     "${OpenCV_BINARY_DIR}"
-    "${OpenCV_SOURCE_DIR}/cmake/checks/fp16.cpp"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp"
     COMPILE_DEFINITIONS "-DCHECK_FP16" "${FP16_OPTION}"
     OUTPUT_VARIABLE TRY_OUT
     )
diff --git a/cmake/OpenCVFindMKL.cmake b/cmake/OpenCVFindMKL.cmake
index fd95dcb446..dbe0482c8b 100644
--- a/cmake/OpenCVFindMKL.cmake
+++ b/cmake/OpenCVFindMKL.cmake
@@ -48,7 +48,7 @@ endif()
 #check current MKL_ROOT_DIR
 if(NOT MKL_ROOT_DIR OR NOT EXISTS ${MKL_ROOT_DIR}/include/mkl.h)
     set(mkl_root_paths ${MKL_ROOT_DIR})
-    if(DEFINED $ENV{MKLROOT})
+    if(DEFINED ENV{MKLROOT})
         list(APPEND mkl_root_paths $ENV{MKLROOT})
     endif()
     if(WIN32)
diff --git a/cmake/OpenCVGenHeaders.cmake b/cmake/OpenCVGenHeaders.cmake
index 2988979045..477b910558 100644
--- a/cmake/OpenCVGenHeaders.cmake
+++ b/cmake/OpenCVGenHeaders.cmake
@@ -3,6 +3,10 @@ configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cvconfig.h.in" "${OPENCV_CO
 configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cvconfig.h.in" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/cvconfig.h")
 install(FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2 COMPONENT dev)
 
+# platform-specific config file
+ocv_compiler_optimization_fill_cpu_config()
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cv_cpu_config.h.in" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cv_cpu_config.h")
+
 # ----------------------------------------------------------------------------
 #  opencv_modules.hpp based on actual modules list
 # ----------------------------------------------------------------------------
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index ce2bc7e08a..10e1f7397c 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -65,6 +65,7 @@ foreach(mod ${OPENCV_MODULES_BUILD} ${OPENCV_MODULES_DISABLED_USER} ${OPENCV_MOD
   unset(OPENCV_MODULE_${mod}_PRIVATE_OPT_DEPS CACHE)
   unset(OPENCV_MODULE_${mod}_LINK_DEPS CACHE)
   unset(OPENCV_MODULE_${mod}_WRAPPERS CACHE)
+  unset(OPENCV_DEPENDANT_TARGETS_${mod} CACHE)
 endforeach()
 
 # clean modules info which needs to be recalculated
@@ -648,6 +649,8 @@ macro(ocv_set_module_sources)
   # use full paths for module to be independent from the module location
   ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS)
 
+  ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
+
   set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
   set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
 endmacro()
diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake
index 29f21d8015..6a83218729 100644
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@@ -65,6 +65,9 @@ MACRO(_PCH_GET_COMPILE_FLAGS _out_compile_flags)
         ocv_is_opencv_directory(__result ${item})
         if(__result)
           LIST(APPEND ${_out_compile_flags} "${_PCH_include_prefix}\"${item}\"")
+        elseif(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
+               item MATCHES "/usr/include$")
+          # workaround for GCC 6.x bug
         else()
           LIST(APPEND ${_out_compile_flags} "${_PCH_isystem_prefix}\"${item}\"")
         endif()
@@ -75,6 +78,9 @@ MACRO(_PCH_GET_COMPILE_FLAGS _out_compile_flags)
         ocv_is_opencv_directory(__result ${item})
         if(__result)
           LIST(APPEND ${_out_compile_flags} "${_PCH_include_prefix}\"${item}\"")
+        elseif(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
+               item MATCHES "/usr/include$")
+          # workaround for GCC 6.x bug
         else()
           LIST(APPEND ${_out_compile_flags} "${_PCH_isystem_prefix}\"${item}\"")
         endif()
@@ -328,7 +334,10 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)
 
         get_target_property(_sources ${_targetName} SOURCES)
         foreach(src ${_sources})
-          if(NOT "${src}" MATCHES "\\.mm$")
+          if(NOT "${src}" MATCHES "\\.mm$"
+               AND NOT "${src}" MATCHES "\\.h$" AND NOT "${src}" MATCHES "\\.hpp$" # header files
+               AND NOT "${src}" MATCHES "^\$" # CMake generator expressions
+          )
             get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
             if(NOT oldProps)
               set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"")
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 474f7db609..4c065d66f4 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -37,7 +37,11 @@ endmacro()
 
 macro(ocv_update VAR)
   if(NOT DEFINED ${VAR})
-    set(${VAR} ${ARGN})
+    if("x${ARGN}" STREQUAL "x")
+      set(${VAR} "")
+    else()
+      set(${VAR} ${ARGN})
+    endif()
   else()
     #ocv_debug_message("Preserve old value for ${VAR}: ${${VAR}}")
   endif()
@@ -151,8 +155,15 @@ function(ocv_append_target_property target prop)
   endif()
 endfunction()
 
+function(ocv_append_dependant_targets target)  # records ARGN as dependant targets of <target>
+  #ocv_debug_message("ocv_append_dependant_targets(${target} ${ARGN})")
+  _ocv_fix_target(target)
+  set(OPENCV_DEPENDANT_TARGETS_${target} "${OPENCV_DEPENDANT_TARGETS_${target}};${ARGN}" CACHE INTERNAL "" FORCE)  # stored as an INTERNAL cache entry, so it is visible outside this function's scope
+endfunction()
+
 # adds include directories in such way that directories from the OpenCV source tree go first
 function(ocv_target_include_directories target)
+  #ocv_debug_message("ocv_target_include_directories(${target} ${ARGN})")
   _ocv_fix_target(target)
   set(__params "")
   if(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
@@ -173,6 +184,11 @@ function(ocv_target_include_directories target)
   else()
     if(TARGET ${target})
       target_include_directories(${target} PRIVATE ${__params})
+      if(OPENCV_DEPENDANT_TARGETS_${target})
+        foreach(t ${OPENCV_DEPENDANT_TARGETS_${target}})
+          target_include_directories(${t} PRIVATE ${__params})
+        endforeach()
+      endif()
     else()
       set(__new_inc "${OCV_TARGET_INCLUDE_DIRS_${target}};${__params}")
       set(OCV_TARGET_INCLUDE_DIRS_${target} "${__new_inc}" CACHE INTERNAL "")
@@ -205,8 +221,11 @@ set(OCV_COMPILER_FAIL_REGEX
   )
 
 MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
+  set(_fname "${ARGN}")
   if(NOT DEFINED ${RESULT})
-    if("_${LANG}_" MATCHES "_CXX_")
+    if(_fname)
+      # nothing
+    elseif("_${LANG}_" MATCHES "_CXX_")
       set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx")
       if("${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror " OR "${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror=unknown-pragmas ")
         FILE(WRITE "${_fname}" "int main() { return 0; }\n")
@@ -231,10 +250,17 @@ MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
       unset(_fname)
     endif()
     if(_fname)
-      MESSAGE(STATUS "Performing Test ${RESULT}")
+      if(NOT "x${ARGN}" STREQUAL "x")
+        file(RELATIVE_PATH __msg "${CMAKE_SOURCE_DIR}" "${ARGN}")
+        set(__msg " (check file: ${__msg})")
+      else()
+        set(__msg "")
+      endif()
+      MESSAGE(STATUS "Performing Test ${RESULT}${__msg}")
       TRY_COMPILE(${RESULT}
         "${CMAKE_BINARY_DIR}"
         "${_fname}"
+        CMAKE_FLAGS "-DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}"   # CMP0056 do this on new CMake
         COMPILE_DEFINITIONS "${FLAG}"
         OUTPUT_VARIABLE OUTPUT)
 
@@ -278,7 +304,11 @@ MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
   endif()
 ENDMACRO()
 
-macro(ocv_check_flag_support lang flag varname)
+macro(ocv_check_flag_support lang flag varname base_options)
+  if(CMAKE_BUILD_TYPE)
+    set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+  endif()
+
   if("_${lang}_" MATCHES "_CXX_")
     set(_lang CXX)
   elseif("_${lang}_" MATCHES "_C_")
@@ -293,7 +323,7 @@ macro(ocv_check_flag_support lang flag varname)
   string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}")
   string(REGEX REPLACE " -|-|=| |\\." "_" ${varname} "${${varname}}")
 
-  ocv_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}})
+  ocv_check_compiler_flag("${_lang}" "${base_options} ${flag}" ${${varname}} ${ARGN})
 endmacro()
 
 # turns off warnings
@@ -327,7 +357,7 @@ macro(ocv_warnings_disable)
             string(REPLACE "${warning}" "" ${var} "${${var}}")
             string(REPLACE "-W" "-Wno-" warning "${warning}")
           endif()
-          ocv_check_flag_support(${var} "${warning}" _varname)
+          ocv_check_flag_support(${var} "${warning}" _varname "")
           if(${_varname})
             set(${var} "${${var}} ${warning}")
           endif()
@@ -342,7 +372,7 @@ macro(ocv_warnings_disable)
           else()
             string(REPLACE "-wd" "-Qwd" warning "${warning}")
           endif()
-          ocv_check_flag_support(${var} "${warning}" _varname)
+          ocv_check_flag_support(${var} "${warning}" _varname "")
           if(${_varname})
             set(${var} "${${var}} ${warning}")
           endif()
@@ -357,7 +387,7 @@ macro(ocv_warnings_disable)
 endmacro()
 
 macro(add_apple_compiler_options the_module)
-  ocv_check_flag_support(OBJCXX "-fobjc-exceptions" HAVE_OBJC_EXCEPTIONS)
+  ocv_check_flag_support(OBJCXX "-fobjc-exceptions" HAVE_OBJC_EXCEPTIONS "")
   if(HAVE_OBJC_EXCEPTIONS)
     foreach(source ${OPENCV_MODULE_${the_module}_SOURCES})
       if("${source}" MATCHES "\\.mm$")
@@ -903,6 +933,11 @@ function(_ocv_append_target_includes target)
     if (TARGET ${target}_object)
       target_include_directories(${target}_object PRIVATE ${OCV_TARGET_INCLUDE_DIRS_${target}})
     endif()
+    if(OPENCV_DEPENDANT_TARGETS_${target})
+      foreach(t ${OPENCV_DEPENDANT_TARGETS_${target}})
+        target_include_directories(${t} PRIVATE ${OCV_TARGET_INCLUDE_DIRS_${target}})
+      endforeach()
+    endif()
     unset(OCV_TARGET_INCLUDE_DIRS_${target} CACHE)
   endif()
 endfunction()
diff --git a/cmake/checks/cpu_avx.cpp b/cmake/checks/cpu_avx.cpp
new file mode 100644
index 0000000000..05536f443f
--- /dev/null
+++ b/cmake/checks/cpu_avx.cpp
@@ -0,0 +1,9 @@
+#if !defined __AVX__ // MSVC supports this flag since MSVS 2013
+#error "__AVX__ define is missing"
+#endif
+#include <immintrin.h>
+void test() // never called; it only has to compile
+{
+    __m256 a = _mm256_set1_ps(0.0f); // exercises a 256-bit vector type and intrinsic
+}
+int main() { return 0; }
diff --git a/cmake/checks/cpu_avx2.cpp b/cmake/checks/cpu_avx2.cpp
new file mode 100644
index 0000000000..3ab1143b8f
--- /dev/null
+++ b/cmake/checks/cpu_avx2.cpp
@@ -0,0 +1,10 @@
+#if !defined __AVX2__ // MSVC supports this flag since MSVS 2013
+#error "__AVX2__ define is missing"
+#endif
+#include <immintrin.h>
+void test() // never called; it only has to compile
+{
+    int data[8] = {0,0,0,0, 0,0,0,0}; // 8 ints = 256 bits, the amount read by the unaligned load below
+    __m256i a = _mm256_loadu_si256((const __m256i *)data);
+}
+int main() { return 0; }
diff --git a/cmake/checks/cpu_avx512.cpp b/cmake/checks/cpu_avx512.cpp
new file mode 100644
index 0000000000..d0898ab3ee
--- /dev/null
+++ b/cmake/checks/cpu_avx512.cpp
@@ -0,0 +1,10 @@
+#if defined __AVX512__ || defined __AVX512F__ // either macro is accepted as evidence of AVX512 support
+#include <immintrin.h>
+void test() // never called; it only has to compile
+{
+    __m512i zmm = _mm512_setzero_si512(); // exercises a 512-bit vector type and intrinsic
+}
+#else
+#error "AVX512 is not supported"
+#endif
+int main() { return 0; }
diff --git a/cmake/checks/fp16.cpp b/cmake/checks/cpu_fp16.cpp
similarity index 86%
rename from cmake/checks/fp16.cpp
rename to cmake/checks/cpu_fp16.cpp
index c77c844834..6951f1c4f7 100644
--- a/cmake/checks/fp16.cpp
+++ b/cmake/checks/cpu_fp16.cpp
@@ -1,6 +1,6 @@
 #include <stdio.h>
 
-#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700)
+#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700) || (defined __INTEL_COMPILER && defined __AVX__)
 #include <immintrin.h>
 int test()
 {
diff --git a/cmake/checks/cpu_popcnt.cpp b/cmake/checks/cpu_popcnt.cpp
new file mode 100644
index 0000000000..f55c9f3c54
--- /dev/null
+++ b/cmake/checks/cpu_popcnt.cpp
@@ -0,0 +1,8 @@
+#include <nmmintrin.h>
+#ifndef _MSC_VER
+#include <popcntintrin.h>
+#endif
+int main() {
+    int i = _mm_popcnt_u32(1); // 32-bit form: declared on both x86 and x86-64 (the _u64 form is x86-64 only)
+    return 0;
+}
diff --git a/cmake/checks/cpu_sse.cpp b/cmake/checks/cpu_sse.cpp
new file mode 100644
index 0000000000..c6269acdb5
--- /dev/null
+++ b/cmake/checks/cpu_sse.cpp
@@ -0,0 +1,2 @@
+#include <xmmintrin.h>
+int main() { return 0; }
diff --git a/cmake/checks/cpu_sse2.cpp b/cmake/checks/cpu_sse2.cpp
new file mode 100644
index 0000000000..68a69f88cb
--- /dev/null
+++ b/cmake/checks/cpu_sse2.cpp
@@ -0,0 +1,2 @@
+#include <emmintrin.h>
+int main() { return 0; }
diff --git a/cmake/checks/cpu_sse3.cpp b/cmake/checks/cpu_sse3.cpp
new file mode 100644
index 0000000000..98ce2191ec
--- /dev/null
+++ b/cmake/checks/cpu_sse3.cpp
@@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+int main() {
+    __m128 u, v;
+    u = _mm_set1_ps(0.0f);
+    v = _mm_moveldup_ps(u); // SSE3
+    return 0;
+}
diff --git a/cmake/checks/cpu_sse41.cpp b/cmake/checks/cpu_sse41.cpp
new file mode 100644
index 0000000000..ddd835b0e7
--- /dev/null
+++ b/cmake/checks/cpu_sse41.cpp
@@ -0,0 +1,6 @@
+#include <smmintrin.h>
+int main() {
+    __m128i a = _mm_setzero_si128(), b = _mm_setzero_si128();
+    __m128i c = _mm_packus_epi32(a, b);
+    return 0;
+}
diff --git a/cmake/checks/cpu_sse42.cpp b/cmake/checks/cpu_sse42.cpp
new file mode 100644
index 0000000000..56f56658ab
--- /dev/null
+++ b/cmake/checks/cpu_sse42.cpp
@@ -0,0 +1,5 @@
+#include <nmmintrin.h>
+int main() {
+    unsigned int res = _mm_crc32_u8(1, 2); // CRC32 is an SSE4.2 instruction and is also declared for 32-bit targets
+    return 0;
+}
diff --git a/cmake/checks/cpu_ssse3.cpp b/cmake/checks/cpu_ssse3.cpp
new file mode 100644
index 0000000000..e583199bcd
--- /dev/null
+++ b/cmake/checks/cpu_ssse3.cpp
@@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+const double v = 0;
+int main() {
+    __m128i a = _mm_setzero_si128();
+    __m128i b = _mm_abs_epi32(a);
+    return 0;
+}
diff --git a/cmake/checks/openvx_refenum_test.cpp b/cmake/checks/openvx_refenum_test.cpp
new file mode 100644
index 0000000000..f28db050cb
--- /dev/null
+++ b/cmake/checks/openvx_refenum_test.cpp
@@ -0,0 +1,5 @@
+#include <VX/vx.h>
+int main()
+{
+    return VX_REFERENCE_COUNT == VX_REFERENCE_TYPE ? VX_REFERENCE_NAME : 0;
+}
diff --git a/cmake/cl2cpp.cmake b/cmake/cl2cpp.cmake
index ed5dcb8761..dfcc2e6833 100644
--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@@ -9,7 +9,7 @@ if (NOT cl_list)
   message(FATAL_ERROR "Can't find OpenCL kernels in directory: ${CL_DIR}")
 endif()
 
-string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
+string(REGEX REPLACE "\\.cpp$" ".hpp" OUTPUT_HPP "${OUTPUT}")
 get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)
 
 if("${MODULE_NAME}" STREQUAL "ocl")
diff --git a/cmake/templates/cv_cpu_config.h.in b/cmake/templates/cv_cpu_config.h.in
new file mode 100644
index 0000000000..27b27315cf
--- /dev/null
+++ b/cmake/templates/cv_cpu_config.h.in
@@ -0,0 +1,5 @@
+// OpenCV CPU baseline features
+@OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE@
+
+// OpenCV supported CPU dispatched features
+@OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE@
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index 05add9e2c5..658d12c14c 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -1,6 +1,15 @@
+#ifndef OPENCV_CVCONFIG_H_INCLUDED
+#define OPENCV_CVCONFIG_H_INCLUDED
+
 /* OpenCV compiled as static or dynamic libs */
 #cmakedefine BUILD_SHARED_LIBS
 
+/* OpenCV intrinsics optimized code */
+#cmakedefine CV_ENABLE_INTRINSICS
+
+/* OpenCV additional optimized code */
+#cmakedefine CV_DISABLE_OPTIMIZATION
+
 /* Compile for 'real' NVIDIA GPU architectures */
 #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
 
@@ -206,3 +215,7 @@
 
 /* OpenVX */
 #cmakedefine HAVE_OPENVX
+
+
+
+#endif // OPENCV_CVCONFIG_H_INCLUDED
diff --git a/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown b/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown
index 0b63515c53..432773d3b2 100644
--- a/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown
+++ b/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown
@@ -86,7 +86,7 @@ kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)
 
 # FLANN parameters
-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks=50)
 
diff --git a/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown b/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown
index 19992fc95b..6414dbed37 100644
--- a/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown
+++ b/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown
@@ -8,7 +8,7 @@ Learn to:
 
 -   Access pixel values and modify them
 -   Access image properties
--   Setting Region of Image (ROI)
+-   Setting Region of Interest (ROI)
 -   Splitting and Merging images
 
 Almost all the operations in this section is mainly related to Numpy rather than OpenCV. A good
diff --git a/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown b/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
index 4f5efa4a82..85547541f5 100644
--- a/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
+++ b/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
@@ -50,7 +50,7 @@ sift = cv2.xfeatures2d.SIFT_create()
 kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)
 
-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks = 50)
 
diff --git a/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown b/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown
index 166ffba4a1..3aa00b715a 100644
--- a/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown
+++ b/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown
@@ -10,7 +10,7 @@ corners are important etc.
 Explanation
 -----------
 
-Most of you will have played the jigsaw puzzle games. You get a lot of small pieces of a images,
+Most of you will have played the jigsaw puzzle games. You get a lot of small pieces of an image,
 where you need to assemble them correctly to form a big real image. **The question is, how you do
 it?** What about the projecting the same theory to a computer program so that computer can play
 jigsaw puzzles? If the computer can play jigsaw puzzles, why can't we give a lot of real-life images
diff --git a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
index a37d579944..a04715438b 100644
--- a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
+++ b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
@@ -148,11 +148,13 @@ its related parameters etc. First one is IndexParams. For various algorithms, th
 passed is explained in FLANN docs. As a summary, for algorithms like SIFT, SURF etc. you can pass
 following:
 @code{.py}
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 @endcode
 While using ORB, you can pass the following. The commented values are recommended as per the docs,
 but it didn't provide required results in some cases. Other values worked fine.:
 @code{.py}
+FLANN_INDEX_LSH = 6
 index_params= dict(algorithm = FLANN_INDEX_LSH,
                    table_number = 6, # 12
                    key_size = 12,     # 20
@@ -179,7 +181,7 @@ kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)
 
 # FLANN parameters
-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks=50)   # or pass empty dictionary
 
diff --git a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown b/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown
index 6589a1c01a..53ed36ee68 100644
--- a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown
+++ b/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown
@@ -19,8 +19,6 @@ Code
 
 This tutorial code's is shown lines below. You can also download it from
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp)
-. The second version (using LBP for face detection) can be [found
-here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp)
 @include samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
 
 Explanation
@@ -34,8 +32,8 @@ Result
 
     ![](images/Cascade_Classifier_Tutorial_Result_Haar.jpg)
 
-    Remember to copy the files *haarcascade_frontalface_alt.xml* and
-    *haarcascade_eye_tree_eyeglasses.xml* in your current directory. They are located in
+    Make sure the program can find the files *haarcascade_frontalface_alt.xml* and
+    *haarcascade_eye_tree_eyeglasses.xml*. They are located in
     *opencv/data/haarcascades*
 
 -#  This is the result of using the file *lbpcascade_frontalface.xml* (LBP trained) for the face
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 5a0e020d31..b5a56ec1b6 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -714,6 +714,30 @@ found, or as colored corners connected with lines if the board was found.
 CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSize,
                                          InputArray corners, bool patternWasFound );
 
+struct CV_EXPORTS_W_SIMPLE CirclesGridFinderParameters
+{
+    CV_WRAP CirclesGridFinderParameters();
+    CV_PROP_RW cv::Size2f densityNeighborhoodSize;
+    CV_PROP_RW float minDensity;
+    CV_PROP_RW int kmeansAttempts;
+    CV_PROP_RW int minDistanceToAddKeypoint;
+    CV_PROP_RW int keypointScale;
+    CV_PROP_RW float minGraphConfidence;
+    CV_PROP_RW float vertexGain;
+    CV_PROP_RW float vertexPenalty;
+    CV_PROP_RW float existingVertexGain;
+    CV_PROP_RW float edgeGain;
+    CV_PROP_RW float edgePenalty;
+    CV_PROP_RW float convexHullFactor;
+    CV_PROP_RW float minRNGEdgeSwitchDist;
+
+    enum GridType
+    {
+      SYMMETRIC_GRID, ASYMMETRIC_GRID
+    };
+    GridType gridType;
+};
+
 /** @brief Finds centers in the grid of circles.
 
 @param image grid view of input circles; it must be an 8-bit grayscale or color image.
@@ -726,6 +750,7 @@ CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSiz
 -   **CALIB_CB_CLUSTERING** uses a special algorithm for grid detection. It is more robust to
 perspective distortions but much more sensitive to background clutter.
 @param blobDetector feature detector that finds blobs like dark circles on light background.
+@param parameters struct for finding circles in a grid pattern.
 
 The function attempts to determine whether the input image contains a grid of circles. If it is, the
 function locates centers of the circles. The function returns a non-zero value if all of the centers
@@ -745,6 +770,12 @@ Sample usage of detecting and drawing the centers of circles: :
 @note The function requires white space (like a square-thick border, the wider the better) around
 the board to make the detection more robust in various environments.
  */
+CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
+                                   OutputArray centers, int flags,
+                                   const Ptr<FeatureDetector> &blobDetector,
+                                   CirclesGridFinderParameters parameters);
+
+/** @overload */
 CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
                                    OutputArray centers, int flags = CALIB_CB_SYMMETRIC_GRID,
                                    const Ptr<FeatureDetector> &blobDetector = SimpleBlobDetector::create());
@@ -1433,6 +1464,28 @@ CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray point
                             double focal = 1.0, Point2d pp = Point2d(0, 0),
                             InputOutputArray mask = noArray() );
 
+/** @overload
+@param E The input essential matrix.
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+Note that this function assumes that points1 and points2 are feature points from cameras with the
+same camera matrix.
+@param R Recovered relative rotation.
+@param t Recovered relative translation.
+@param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite points).
+@param mask Input/output mask for inliers in points1 and points2.
+:   If it is not empty, then it marks inliers in points1 and points2 for the given essential
+matrix E. Only these inliers will be used to recover pose. The output mask keeps only the inliers
+which pass the cheirality check.
+@param triangulatedPoints 3d points which were reconstructed by triangulation.
+ */
+
+CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
+                            InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(),
+                            OutputArray triangulatedPoints = noArray());
+
 /** @brief For points in an image of a stereo pair, computes the corresponding epilines in the other image.
 
 @param points Input points. \f$N \times 1\f$ or \f$1 \times N\f$ matrix of type CV_32FC2 or
diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 2e9f07a274..059612d6de 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -442,7 +442,7 @@ int cvFindChessboardCorners( const void* arr, CvSize pattern_size,
 
     Mat img = cvarrToMat((CvMat*)arr).clone();
 
-    if( img.depth() != CV_8U || (img.channels() != 1 && img.channels() != 3) )
+    if( img.depth() != CV_8U || (img.channels() != 1 && img.channels() != 3 && img.channels() != 4) )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit grayscale or color images are supported" );
 
     if( pattern_size.width <= 2 || pattern_size.height <= 2 )
@@ -2093,7 +2093,8 @@ void cv::drawChessboardCorners( InputOutputArray _image, Size patternSize,
 }
 
 bool cv::findCirclesGrid( InputArray _image, Size patternSize,
-                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector )
+                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector,
+                          CirclesGridFinderParameters parameters)
 {
     CV_INSTRUMENT_REGION()
 
@@ -2120,13 +2121,6 @@ bool cv::findCirclesGrid( InputArray _image, Size patternSize,
       return !centers.empty();
     }
 
-    CirclesGridFinderParameters parameters;
-    parameters.vertexPenalty = -0.6f;
-    parameters.vertexGain = 1;
-    parameters.existingVertexGain = 10000;
-    parameters.edgeGain = 1;
-    parameters.edgePenalty = -0.6f;
-
     if(flags & CALIB_CB_ASYMMETRIC_GRID)
       parameters.gridType = CirclesGridFinderParameters::ASYMMETRIC_GRID;
     if(flags & CALIB_CB_SYMMETRIC_GRID)
@@ -2192,4 +2186,10 @@ bool cv::findCirclesGrid( InputArray _image, Size patternSize,
     return false;
 }
 
+bool cv::findCirclesGrid( InputArray _image, Size patternSize,
+                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector)
+{
+    return cv::findCirclesGrid(_image, patternSize, _centers, flags, blobDetector, CirclesGridFinderParameters());
+}
+
 /* End of file. */
diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp
index 2038e520a4..df85809d41 100644
--- a/modules/calib3d/src/circlesgrid.cpp
+++ b/modules/calib3d/src/circlesgrid.cpp
@@ -551,11 +551,11 @@ CirclesGridFinderParameters::CirclesGridFinderParameters()
   keypointScale = 1;
 
   minGraphConfidence = 9;
-  vertexGain = 2;
-  vertexPenalty = -5;
+  vertexGain = 1;
+  vertexPenalty = -0.6f;
   edgeGain = 1;
-  edgePenalty = -5;
-  existingVertexGain = 0;
+  edgePenalty = -0.6f;
+  existingVertexGain = 10000;
 
   minRNGEdgeSwitchDist = 5.f;
   gridType = SYMMETRIC_GRID;
diff --git a/modules/calib3d/src/circlesgrid.hpp b/modules/calib3d/src/circlesgrid.hpp
index fd1389298c..8f55f6c5ef 100644
--- a/modules/calib3d/src/circlesgrid.hpp
+++ b/modules/calib3d/src/circlesgrid.hpp
@@ -119,35 +119,11 @@ struct Path
   }
 };
 
-struct CirclesGridFinderParameters
-{
-  CirclesGridFinderParameters();
-  cv::Size2f densityNeighborhoodSize;
-  float minDensity;
-  int kmeansAttempts;
-  int minDistanceToAddKeypoint;
-  int keypointScale;
-  float minGraphConfidence;
-  float vertexGain;
-  float vertexPenalty;
-  float existingVertexGain;
-  float edgeGain;
-  float edgePenalty;
-  float convexHullFactor;
-  float minRNGEdgeSwitchDist;
-
-  enum GridType
-  {
-    SYMMETRIC_GRID, ASYMMETRIC_GRID
-  };
-  GridType gridType;
-};
-
 class CirclesGridFinder
 {
 public:
   CirclesGridFinder(cv::Size patternSize, const std::vector<cv::Point2f> &testKeypoints,
-                    const CirclesGridFinderParameters &parameters = CirclesGridFinderParameters());
+                    const cv::CirclesGridFinderParameters &parameters = cv::CirclesGridFinderParameters());
   bool findHoles();
   static cv::Mat rectifyGrid(cv::Size detectedGridSize, const std::vector<cv::Point2f>& centers, const std::vector<
       cv::Point2f> &keypoint, std::vector<cv::Point2f> &warpedKeypoints);
@@ -211,7 +187,7 @@ private:
   std::vector<std::vector<size_t> > *smallHoles;
 
   const cv::Size_<size_t> patternSize;
-  CirclesGridFinderParameters parameters;
+  cv::CirclesGridFinderParameters parameters;
 
   CirclesGridFinder& operator=(const CirclesGridFinder&);
   CirclesGridFinder(const CirclesGridFinder&);
diff --git a/modules/calib3d/src/five-point.cpp b/modules/calib3d/src/five-point.cpp
index 1d39e20f87..ecc1cfcf6f 100644
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@@ -458,8 +458,9 @@ cv::Mat cv::findEssentialMat( InputArray _points1, InputArray _points2, double f
     return cv::findEssentialMat(_points1, _points2, cameraMatrix, method, prob, threshold, _mask);
 }
 
-int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, InputArray _cameraMatrix,
-                     OutputArray _R, OutputArray _t, InputOutputArray _mask)
+int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2,
+                            InputArray _cameraMatrix, OutputArray _R, OutputArray _t, double distanceThresh,
+                     InputOutputArray _mask, OutputArray triangulatedPoints)
 {
     CV_INSTRUMENT_REGION()
 
@@ -506,51 +507,60 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
     // Notice here a threshold dist is used to filter
     // out far away points (i.e. infinite points) since
     // there depth may vary between postive and negtive.
-    double dist = 50.0;
+    std::vector<Mat> allTriangulations(4);
     Mat Q;
+
     triangulatePoints(P0, P1, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[0]);
     Mat mask1 = Q.row(2).mul(Q.row(3)) > 0;
     Q.row(0) /= Q.row(3);
     Q.row(1) /= Q.row(3);
     Q.row(2) /= Q.row(3);
     Q.row(3) /= Q.row(3);
-    mask1 = (Q.row(2) < dist) & mask1;
+    mask1 = (Q.row(2) < distanceThresh) & mask1;
     Q = P1 * Q;
     mask1 = (Q.row(2) > 0) & mask1;
-    mask1 = (Q.row(2) < dist) & mask1;
+    mask1 = (Q.row(2) < distanceThresh) & mask1;
 
     triangulatePoints(P0, P2, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[1]);
     Mat mask2 = Q.row(2).mul(Q.row(3)) > 0;
     Q.row(0) /= Q.row(3);
     Q.row(1) /= Q.row(3);
     Q.row(2) /= Q.row(3);
     Q.row(3) /= Q.row(3);
-    mask2 = (Q.row(2) < dist) & mask2;
+    mask2 = (Q.row(2) < distanceThresh) & mask2;
     Q = P2 * Q;
     mask2 = (Q.row(2) > 0) & mask2;
-    mask2 = (Q.row(2) < dist) & mask2;
+    mask2 = (Q.row(2) < distanceThresh) & mask2;
 
     triangulatePoints(P0, P3, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[2]);
     Mat mask3 = Q.row(2).mul(Q.row(3)) > 0;
     Q.row(0) /= Q.row(3);
     Q.row(1) /= Q.row(3);
     Q.row(2) /= Q.row(3);
     Q.row(3) /= Q.row(3);
-    mask3 = (Q.row(2) < dist) & mask3;
+    mask3 = (Q.row(2) < distanceThresh) & mask3;
     Q = P3 * Q;
     mask3 = (Q.row(2) > 0) & mask3;
-    mask3 = (Q.row(2) < dist) & mask3;
+    mask3 = (Q.row(2) < distanceThresh) & mask3;
 
     triangulatePoints(P0, P4, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[3]);
     Mat mask4 = Q.row(2).mul(Q.row(3)) > 0;
     Q.row(0) /= Q.row(3);
     Q.row(1) /= Q.row(3);
     Q.row(2) /= Q.row(3);
     Q.row(3) /= Q.row(3);
-    mask4 = (Q.row(2) < dist) & mask4;
+    mask4 = (Q.row(2) < distanceThresh) & mask4;
     Q = P4 * Q;
     mask4 = (Q.row(2) > 0) & mask4;
-    mask4 = (Q.row(2) < dist) & mask4;
+    mask4 = (Q.row(2) < distanceThresh) & mask4;
 
     mask1 = mask1.t();
     mask2 = mask2.t();
@@ -583,6 +593,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
 
     if (good1 >= good2 && good1 >= good3 && good1 >= good4)
     {
+        if(triangulatedPoints.needed()) allTriangulations[0].copyTo(triangulatedPoints);
         R1.copyTo(_R);
         t.copyTo(_t);
         if (_mask.needed()) mask1.copyTo(_mask);
@@ -590,6 +601,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
     }
     else if (good2 >= good1 && good2 >= good3 && good2 >= good4)
     {
+        if(triangulatedPoints.needed()) allTriangulations[1].copyTo(triangulatedPoints);
         R2.copyTo(_R);
         t.copyTo(_t);
         if (_mask.needed()) mask2.copyTo(_mask);
@@ -597,6 +609,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
     }
     else if (good3 >= good1 && good3 >= good2 && good3 >= good4)
     {
+        if(triangulatedPoints.needed()) allTriangulations[2].copyTo(triangulatedPoints);
         t = -t;
         R1.copyTo(_R);
         t.copyTo(_t);
@@ -605,6 +618,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
     }
     else
     {
+        if(triangulatedPoints.needed()) allTriangulations[3].copyTo(triangulatedPoints);
         t = -t;
         R2.copyTo(_R);
         t.copyTo(_t);
@@ -613,6 +627,12 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
     }
 }
 
+int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, InputArray _cameraMatrix,
+                     OutputArray _R, OutputArray _t, InputOutputArray _mask)
+{
+    return cv::recoverPose(E, _points1, _points2, _cameraMatrix, _R, _t, 50, _mask);
+}
+
 int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, OutputArray _R,
                      OutputArray _t, double focal, Point2d pp, InputOutputArray _mask)
 {
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 75b5cc5d57..5d18823d58 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -626,7 +626,7 @@ then pass the matrix to calcCovarMatrix .
 @param src input array that should have from 1 to 4 channels so that the results can be stored in
 Scalar_ 's.
 @param mean output parameter: calculated mean value.
-@param stddev output parameter: calculateded standard deviation.
+@param stddev output parameter: calculated standard deviation.
 @param mask optional operation mask.
 @sa  countNonZero, mean, norm, minMaxLoc, calcCovarMatrix
 */
@@ -1639,7 +1639,7 @@ CV_EXPORTS_W void mulTransposed( InputArray src, OutputArray dst, bool aTa,
 
 The function cv::transpose transposes the matrix src :
 \f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
-@note No complex conjugation is done in case of a complex matrix. It it
+@note No complex conjugation is done in case of a complex matrix. It
 should be done separately if needed.
 @param src input array.
 @param dst output array of the same type as src.
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index b319df6f38..07ca3a51a1 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -239,6 +239,10 @@ enum DftFlags {
         into a real array and inverse transformation is executed, the function treats the input as a
         packed complex-conjugate symmetrical array, and the output will also be a real array). */
     DFT_REAL_OUTPUT    = 32,
+    /** specifies that the input is complex. If this flag is set, the input must have 2 channels.
+        On the other hand, for backward compatibility reasons, an input with 2 channels is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
     /** performs an inverse 1D or 2D transform instead of the default forward transform. */
     DCT_INVERSE        = DFT_INVERSE,
     /** performs a forward or inverse transform of every individual row of the input
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index c538392cbd..8856520c1a 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -327,6 +327,34 @@ The function does not reallocate memory if the matrix has proper attributes alre
  */
 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
 
+/** @brief BufferPool for use with CUDA streams
+
+ * BufferPool utilizes cuda::Stream's allocator to create new buffers. It is
+ * particularly useful when BufferPoolUsage is set to true, or a custom
+ * allocator is specified for the cuda::Stream, and you want to implement your
+ * own stream based functions utilizing the same underlying GPU memory
+ * management.
+ */
+class CV_EXPORTS BufferPool
+{
+public:
+
+    //! Gets the BufferPool for the given stream.
+    explicit BufferPool(Stream& stream);
+
+    //! Allocates a new GpuMat of given size and type.
+    GpuMat getBuffer(int rows, int cols, int type);
+
+    //! Allocates a new GpuMat of given size and type.
+    GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
+
+    //! Returns the allocator associated with the stream.
+    Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
+
+private:
+    Ptr<GpuMat::Allocator> allocator_;
+};
+
 //! BufferPool management (must be called before Stream creation)
 CV_EXPORTS void setBufferPoolUsage(bool on);
 CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
@@ -479,6 +507,9 @@ public:
     //! creates a new asynchronous stream
     Stream();
 
+    //! creates a new asynchronous stream with custom allocator
+    Stream(const Ptr<GpuMat::Allocator>& allocator);
+
     /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
     */
     bool queryIfComplete() const;
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
new file mode 100644
index 0000000000..9a8537f909
--- /dev/null
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if defined __OPENCV_BUILD \
+
+#include "cv_cpu_config.h"
+#include "cv_cpu_helper.h"
+
+#if defined CV_ENABLE_INTRINSICS \
+    && !defined CV_DISABLE_OPTIMIZATION \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
+
+#ifdef CV_CPU_COMPILE_SSE2
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE3
+#  include <pmmintrin.h>
+#  define CV_SSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSSE3
+#  include <tmmintrin.h>
+#  define CV_SSSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_1
+#  include <smmintrin.h>
+#  define CV_SSE4_1 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_2
+#  include <nmmintrin.h>
+#  define CV_SSE4_2 1
+#endif
+#ifdef CV_CPU_COMPILE_POPCNT
+#  ifdef _MSC_VER
+#    include <nmmintrin.h>
+#    if defined(_M_X64)
+#      define CV_POPCNT_U64 _mm_popcnt_u64
+#    endif
+#    define CV_POPCNT_U32 _mm_popcnt_u32
+#  else
+#    include <popcntintrin.h>
+#    if defined(__x86_64__)
+#      define CV_POPCNT_U64 __builtin_popcountll
+#    endif
+#    define CV_POPCNT_U32 __builtin_popcount
+#  endif
+#  define CV_POPCNT 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX
+#  include <immintrin.h>
+#  define CV_AVX 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX2
+#  include <immintrin.h>
+#  define CV_AVX2 1
+#endif
+#ifdef CV_CPU_COMPILE_FMA3
+#  define CV_FMA3 1
+#endif
+
+#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#if defined(__ARM_NEON__) || defined(__aarch64__)
+#  include <arm_neon.h>
+#endif
+
+#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
+
+#endif // __OPENCV_BUILD
+
+
+
+#if !defined __OPENCV_BUILD // Compatibility code
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#elif (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#endif // !__OPENCV_BUILD (Compatibility code)
+
+
+
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_POPCNT
+#  define CV_POPCNT 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA512
+#  define CV_AVX_512IFMA512 0
+#endif
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
new file mode 100644
index 0000000000..cb755d615e
--- /dev/null
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -0,0 +1,133 @@
+// AUTOGENERATED, DO NOT EDIT
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
+#  define CV_CPU_HAS_SUPPORT_SSE 1
+#  define CV_CPU_CALL_SSE(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
+#  define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
+#  define CV_CPU_CALL_SSE(...) if (CV_CPU_HAS_SUPPORT_SSE) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE 0
+#  define CV_CPU_CALL_SSE(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
+#  define CV_CPU_HAS_SUPPORT_SSE2 1
+#  define CV_CPU_CALL_SSE2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
+#  define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
+#  define CV_CPU_CALL_SSE2(...) if (CV_CPU_HAS_SUPPORT_SSE2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE2 0
+#  define CV_CPU_CALL_SSE2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
+#  define CV_CPU_HAS_SUPPORT_SSE3 1
+#  define CV_CPU_CALL_SSE3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
+#  define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
+#  define CV_CPU_CALL_SSE3(...) if (CV_CPU_HAS_SUPPORT_SSE3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE3 0
+#  define CV_CPU_CALL_SSE3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
+#  define CV_CPU_HAS_SUPPORT_SSSE3 1
+#  define CV_CPU_CALL_SSSE3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
+#  define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
+#  define CV_CPU_CALL_SSSE3(...) if (CV_CPU_HAS_SUPPORT_SSSE3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSSE3 0
+#  define CV_CPU_CALL_SSSE3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 1
+#  define CV_CPU_CALL_SSE4_1(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
+#  define CV_CPU_CALL_SSE4_1(...) if (CV_CPU_HAS_SUPPORT_SSE4_1) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 0
+#  define CV_CPU_CALL_SSE4_1(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 1
+#  define CV_CPU_CALL_SSE4_2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
+#  define CV_CPU_CALL_SSE4_2(...) if (CV_CPU_HAS_SUPPORT_SSE4_2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 0
+#  define CV_CPU_CALL_SSE4_2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
+#  define CV_CPU_HAS_SUPPORT_POPCNT 1
+#  define CV_CPU_CALL_POPCNT(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
+#  define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
+#  define CV_CPU_CALL_POPCNT(...) if (CV_CPU_HAS_SUPPORT_POPCNT) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_POPCNT 0
+#  define CV_CPU_CALL_POPCNT(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
+#  define CV_CPU_HAS_SUPPORT_AVX 1
+#  define CV_CPU_CALL_AVX(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
+#  define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
+#  define CV_CPU_CALL_AVX(...) if (CV_CPU_HAS_SUPPORT_AVX) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_AVX 0
+#  define CV_CPU_CALL_AVX(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
+#  define CV_CPU_HAS_SUPPORT_FP16 1
+#  define CV_CPU_CALL_FP16(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
+#  define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
+#  define CV_CPU_CALL_FP16(...) if (CV_CPU_HAS_SUPPORT_FP16) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_FP16 0
+#  define CV_CPU_CALL_FP16(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
+#  define CV_CPU_HAS_SUPPORT_AVX2 1
+#  define CV_CPU_CALL_AVX2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
+#  define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
+#  define CV_CPU_CALL_AVX2(...) if (CV_CPU_HAS_SUPPORT_AVX2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_AVX2 0
+#  define CV_CPU_CALL_AVX2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
+#  define CV_CPU_HAS_SUPPORT_FMA3 1
+#  define CV_CPU_CALL_FMA3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
+#  define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
+#  define CV_CPU_CALL_FMA3(...) if (CV_CPU_HAS_SUPPORT_FMA3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_FMA3 0
+#  define CV_CPU_CALL_FMA3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
+#  define CV_CPU_HAS_SUPPORT_NEON 1
+#  define CV_CPU_CALL_NEON(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
+#  define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
+#  define CV_CPU_CALL_NEON(...) if (CV_CPU_HAS_SUPPORT_NEON) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_NEON 0
+#  define CV_CPU_CALL_NEON(...)
+#endif
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 91ebd20774..f2212b4217 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -48,6 +48,10 @@
 //! @addtogroup core_utils
 //! @{
 
+#ifdef __OPENCV_BUILD
+#include "cvconfig.h"
+#endif
+
 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
 #  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
 #endif
@@ -59,10 +63,6 @@
 #undef abs
 #undef Complex
 
-#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
-#  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
-#endif
-
 #include <limits.h>
 #include "opencv2/core/hal/interface.h"
 
@@ -88,7 +88,7 @@
 #  endif
 #endif
 
-#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
+#if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED)
 #  define CV_ENABLE_UNROLLED 0
 #else
 #  define CV_ENABLE_UNROLLED 1
@@ -161,150 +161,9 @@ enum CpuFeatures {
     CPU_NEON            = 100
 };
 
-// do not include SSE/AVX/NEON headers for NVCC compiler
-#ifndef __CUDACC__
-
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  include <emmintrin.h>
-#  define CV_MMX 1
-#  define CV_SSE 1
-#  define CV_SSE2 1
-#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <pmmintrin.h>
-#    define CV_SSE3 1
-#  endif
-#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <tmmintrin.h>
-#    define CV_SSSE3 1
-#  endif
-#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <smmintrin.h>
-#    define CV_SSE4_1 1
-#  endif
-#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <nmmintrin.h>
-#    define CV_SSE4_2 1
-#  endif
-#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    ifdef _MSC_VER
-#      include <nmmintrin.h>
-#      if defined(_M_X64)
-#        define CV_POPCNT_U64 _mm_popcnt_u64
-#      endif
-#      define CV_POPCNT_U32 _mm_popcnt_u32
-#    else
-#      include <popcntintrin.h>
-#      if defined(__x86_64__)
-#        define CV_POPCNT_U64 __builtin_popcountll
-#      endif
-#      define CV_POPCNT_U32 __builtin_popcount
-#    endif
-#    define CV_POPCNT 1
-#  endif
-#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
-// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
-// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
-#    include <immintrin.h>
-#    define CV_AVX 1
-#    if defined(_XCR_XFEATURE_ENABLED_MASK)
-#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
-#    else
-#      define __xgetbv() 0
-#    endif
-#  endif
-#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
-#    include <immintrin.h>
-#    define CV_AVX2 1
-#    if defined __FMA__
-#      define CV_FMA3 1
-#    endif
-#  endif
-#endif
-
-#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
-# include <Intrin.h>
-# include <arm_neon.h>
-# define CV_NEON 1
-# define CPU_HAS_NEON_FEATURE (true)
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
-#  include <arm_neon.h>
-#  define CV_NEON 1
-#endif
-
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
-#  define CV_VFP 1
-#endif
-
-#endif // __CUDACC__
-
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
-#ifndef CV_MMX
-#  define CV_MMX 0
-#endif
-#ifndef CV_SSE
-#  define CV_SSE 0
-#endif
-#ifndef CV_SSE2
-#  define CV_SSE2 0
-#endif
-#ifndef CV_SSE3
-#  define CV_SSE3 0
-#endif
-#ifndef CV_SSSE3
-#  define CV_SSSE3 0
-#endif
-#ifndef CV_SSE4_1
-#  define CV_SSE4_1 0
-#endif
-#ifndef CV_SSE4_2
-#  define CV_SSE4_2 0
-#endif
-#ifndef CV_AVX
-#  define CV_AVX 0
-#endif
-#ifndef CV_AVX2
-#  define CV_AVX2 0
-#endif
-#ifndef CV_FMA3
-#  define CV_FMA3 0
-#endif
-#ifndef CV_AVX_512F
-#  define CV_AVX_512F 0
-#endif
-#ifndef CV_AVX_512BW
-#  define CV_AVX_512BW 0
-#endif
-#ifndef CV_AVX_512CD
-#  define CV_AVX_512CD 0
-#endif
-#ifndef CV_AVX_512DQ
-#  define CV_AVX_512DQ 0
-#endif
-#ifndef CV_AVX_512ER
-#  define CV_AVX_512ER 0
-#endif
-#ifndef CV_AVX_512IFMA512
-#  define CV_AVX_512IFMA512 0
-#endif
-#ifndef CV_AVX_512PF
-#  define CV_AVX_512PF 0
-#endif
-#ifndef CV_AVX_512VBMI
-#  define CV_AVX_512VBMI 0
-#endif
-#ifndef CV_AVX_512VL
-#  define CV_AVX_512VL 0
-#endif
 
-#ifndef CV_NEON
-#  define CV_NEON 0
-#endif
+#include "cv_cpu_dispatch.h"
 
-#ifndef CV_VFP
-#  define CV_VFP 0
-#endif
 
 /* fundamental constants */
 #define CV_PI   3.1415926535897932384626433832795
diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index 92c2f350a8..8aae46d9d2 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -47,6 +47,12 @@
 
 #include "opencv2/core/cvdef.h"
 
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+#include <emmintrin.h>
+#endif
+
+
 //! @addtogroup core_utils
 //! @{
 
@@ -68,7 +74,7 @@
 #  include "tegra_round.hpp"
 #endif
 
-#if CV_VFP
+#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
     // 1. general scheme
     #define ARM_ROUND(_value, _asm_string) \
         int res; \
@@ -84,7 +90,7 @@
     #endif
     // 3. version for float
     #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
-#endif // CV_VFP
+#endif
 
 /** @brief Rounds floating-point number to the nearest integer
 
@@ -95,7 +101,7 @@ CV_INLINE int
 cvRound( double value )
 {
 #if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
-    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
     __m128d t = _mm_set_sd( value );
     return _mm_cvtsd_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -110,7 +116,7 @@ cvRound( double value )
         defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
     TEGRA_ROUND_DBL(value);
 #elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
+# if defined ARM_ROUND_DBL
     ARM_ROUND_DBL(value);
 # else
     return (int)lrint(value);
@@ -132,18 +138,8 @@ cvRound( double value )
  */
 CV_INLINE int cvFloor( double value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
-#elif defined __GNUC__
     int i = (int)value;
     return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
 }
 
 /** @brief Rounds floating-point number to the nearest integer not smaller than the original.
@@ -155,18 +151,8 @@ CV_INLINE int cvFloor( double value )
  */
 CV_INLINE int cvCeil( double value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
-#elif defined __GNUC__
     int i = (int)value;
     return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
 }
 
 /** @brief Determines if the argument is Not A Number.
@@ -202,8 +188,8 @@ CV_INLINE int cvIsInf( double value )
 /** @overload */
 CV_INLINE int cvRound(float value)
 {
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
-      defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
     __m128 t = _mm_set_ss( value );
     return _mm_cvtss_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -218,7 +204,7 @@ CV_INLINE int cvRound(float value)
         defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
     TEGRA_ROUND_FLT(value);
 #elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
+# if defined ARM_ROUND_FLT
     ARM_ROUND_FLT(value);
 # else
     return (int)lrintf(value);
@@ -239,18 +225,8 @@ CV_INLINE int cvRound( int value )
 /** @overload */
 CV_INLINE int cvFloor( float value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
-#elif defined __GNUC__
     int i = (int)value;
     return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
 }
 
 /** @overload */
@@ -262,18 +238,8 @@ CV_INLINE int cvFloor( int value )
 /** @overload */
 CV_INLINE int cvCeil( float value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
-#elif defined __GNUC__
     int i = (int)value;
     return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
 }
 
 /** @overload */
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 0bd0ba53e4..e3a24032d0 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -73,8 +73,8 @@ It is defined as:
     typedef const _InputArray& InputArray;
 @endcode
 where _InputArray is a class that can be constructed from `Mat`, `Mat_<T>`, `Matx<T, m, n>`,
-`std::vector<T>`, `std::vector<std::vector<T> >` or `std::vector<Mat>`. It can also be constructed
-from a matrix expression.
+`std::vector<T>`, `std::vector<std::vector<T> >`, `std::vector<Mat>`, `std::vector<Mat_<T> >`,
+`UMat`, `std::vector<UMat>` or `double`. It can also be constructed from a matrix expression.
 
 Since this is mostly implementation-level class, and its interface may change in future versions, we
 do not describe it in details. There are a few key things, though, that should be kept in mind:
@@ -660,7 +660,7 @@ sub-matrices.
 
 - Use MATLAB-style array initializers, zeros(), ones(), eye(), for example:
 @code
-    // create a double-precision identity martix and add it to M.
+    // create a double-precision identity matrix and add it to M.
     M += Mat::eye(M.rows, M.cols, CV_64F);
 @endcode
 
@@ -693,7 +693,7 @@ If you need to process a whole row of a 2D array, the most efficient way is to g
 the row first, and then just use the plain C operator [] :
 @code
     // compute sum of positive matrix elements
-    // (assuming that M isa double-precision matrix)
+    // (assuming that M is a double-precision matrix)
     double sum=0;
     for(int i = 0; i < M.rows; i++)
     {
@@ -1085,6 +1085,29 @@ public:
       immediately below the main one.
     - `d>0` is a diagonal from the upper half. For example, d=1 means the diagonal is set
       immediately above the main one.
+    For example:
+    @code
+        Mat m = (Mat_<int>(3,3) <<
+                    1,2,3,
+                    4,5,6,
+                    7,8,9);
+        Mat d0 = m.diag(0);
+        Mat d1 = m.diag(1);
+        Mat d_1 = m.diag(-1);
+    @endcode
+    The resulting matrices are
+    @code
+     d0 =
+       [1;
+        5;
+        9]
+     d1 =
+       [2;
+        6]
+     d_1 =
+       [4;
+        8]
+    @endcode
      */
     Mat diag(int d=0) const;
 
@@ -2287,9 +2310,9 @@ public:
     UMat colRange(int startcol, int endcol) const;
     UMat colRange(const Range& r) const;
     //! ... for the specified diagonal
-    // (d=0 - the main diagonal,
-    //  >0 - a diagonal from the lower half,
-    //  <0 - a diagonal from the upper half)
+    //! (d=0 - the main diagonal,
+    //!  >0 - a diagonal from the upper half,
+    //!  <0 - a diagonal from the lower half)
     UMat diag(int d=0) const;
     //! constructs a square diagonal matrix which main diagonal is vector "d"
     static UMat diag(const UMat& d);
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 4a32de165a..b3b7110cbc 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -1634,14 +1634,14 @@ Mat_<_Tp> Mat_<_Tp>::operator()(const std::vector<Range>& ranges) const
 template<typename _Tp> inline
 _Tp* Mat_<_Tp>::operator [](int y)
 {
-    CV_DbgAssert( 0 <= y && y < rows );
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
     return (_Tp*)(data + y*step.p[0]);
 }
 
 template<typename _Tp> inline
 const _Tp* Mat_<_Tp>::operator [](int y) const
 {
-    CV_DbgAssert( 0 <= y && y < rows );
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
     return (const _Tp*)(data + y*step.p[0]);
 }
 
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 0d07c3f98b..56a2a93f1e 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -590,11 +590,12 @@ Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp
 template<typename _Tp, int m, int n> inline
 Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
 {
-    CV_StaticAssert(channels == 14, "Matx should have at least 14 elements.");
+    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
     val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
     val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
     val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
     val[12] = v12; val[13] = v13;
+    for (int i = 14; i < channels; i++) val[i] = _Tp(0);
 }
 
 
diff --git a/modules/core/include/opencv2/core/persistence.hpp b/modules/core/include/opencv2/core/persistence.hpp
index 61d1d27fa0..3ead8af14b 100644
--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@@ -1055,6 +1055,20 @@ void write(FileStorage& fs, const String& name, const Range& r )
     write(fs, r);
 }
 
+static inline // named-entry overload: writes KeyPoint `r` into `fs` under the key `name`
+void write(FileStorage& fs, const String& name, const KeyPoint& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // named SEQ+FLOW struct; presumably closed when `ws` is destroyed - confirm in WriteStructContext
+    write(fs, r); // delegates to the two-argument write() overload for the element itself
+}
+
+static inline // named-entry overload: writes DMatch `r` into `fs` under the key `name`
+void write(FileStorage& fs, const String& name, const DMatch& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // named SEQ+FLOW struct; presumably closed when `ws` is destroyed - confirm in WriteStructContext
+    write(fs, r); // delegates to the two-argument write() overload for the element itself
+}
+
 template<typename _Tp> static inline
 void write( FileStorage& fs, const String& name, const std::vector<_Tp>& vec )
 {
@@ -1245,6 +1259,14 @@ void operator >> (const FileNode& n, std::vector<KeyPoint>& vec)
 {
     read(n, vec);
 }
+
+static inline // reads a single KeyPoint from the children of node `n`
+void operator >> (const FileNode& n, KeyPoint& kpt)
+{
+    FileNodeIterator it = n.begin(); // iterate over the node's elements in stored order
+    it >> kpt.pt.x >> kpt.pt.y >> kpt.size >> kpt.angle >> kpt.response >> kpt.octave >> kpt.class_id; // field order is fixed: x, y, size, angle, response, octave, class_id
+}
+
 /** @brief Reads DMatch from a file storage.
 */
 //It needs special handling because it contains two types of fields, int & float.
@@ -1254,6 +1276,13 @@ void operator >> (const FileNode& n, std::vector<DMatch>& vec)
     read(n, vec);
 }
 
+static inline // reads a single DMatch from the children of node `n`
+void operator >> (const FileNode& n, DMatch& m)
+{
+    FileNodeIterator it = n.begin(); // iterate over the node's elements in stored order
+    it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance; // field order is fixed: queryIdx, trainIdx, imgIdx, distance
+}
+
 //! @} FileNode
 
 //! @relates cv::FileNodeIterator
diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index 01a4ab3bf9..1214d70304 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -102,20 +102,6 @@ static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The
 
 namespace cv { namespace cuda
 {
-    class CV_EXPORTS BufferPool
-    {
-    public:
-        explicit BufferPool(Stream& stream);
-
-        GpuMat getBuffer(int rows, int cols, int type);
-        GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
-
-        GpuMat::Allocator* getAllocator() const { return allocator_; }
-
-    private:
-        GpuMat::Allocator* allocator_;
-    };
-
     static inline void checkNppError(int code, const char* file, const int line, const char* func)
     {
         if (code < 0)
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index 93d599ccee..bc33f03f5d 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -507,7 +507,7 @@ void Mat::forEach_impl(const Functor& operation) {
                     this->rowCall2(row, COLS);
                 }
             } else {
-                std::vector<int> idx(COLS); /// idx is modified in this->rowCall
+                std::vector<int> idx(DIMS); /// idx is modified in this->rowCall
                 idx[DIMS - 2] = range.start - 1;
 
                 for (int line_num = range.start; line_num < range.end; ++line_num) {
diff --git a/modules/core/src/cuda_stream.cpp b/modules/core/src/cuda_stream.cpp
index 1ea8df37b9..696771404c 100644
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@@ -282,9 +282,10 @@ public:
     cudaStream_t stream;
     bool ownStream;
 
-    Ptr<StackAllocator> stackAllocator;
+    Ptr<GpuMat::Allocator> allocator;
 
     Impl();
+    Impl(const Ptr<GpuMat::Allocator>& allocator);
     explicit Impl(cudaStream_t stream);
 
     ~Impl();
@@ -295,17 +296,23 @@ cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
     cudaSafeCall( cudaStreamCreate(&stream) );
     ownStream = true;
 
-    stackAllocator = makePtr<StackAllocator>(stream);
+    allocator = makePtr<StackAllocator>(stream);
+}
+
+cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0), ownStream(false), allocator(allocator) // keeps the caller-supplied allocator instead of creating a StackAllocator
+{
+    cudaSafeCall( cudaStreamCreate(&stream) ); // create a new CUDA stream for this Impl
+    ownStream = true; // set only after creation succeeded; ~Impl() tests `stream && ownStream` before its cleanup branch
 }
 
 cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
 {
-    stackAllocator = makePtr<StackAllocator>(stream);
+    allocator = makePtr<StackAllocator>(stream);
 }
 
 cv::cuda::Stream::Impl::~Impl()
 {
-    stackAllocator.release();
+    allocator.release();
 
     if (stream && ownStream)
     {
@@ -417,6 +424,16 @@ cv::cuda::Stream::Stream()
 #endif
 }
 
+cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator) // constructs a stream whose Impl uses the given allocator
+{
+#ifndef HAVE_CUDA
+    (void) allocator; // parameter is unused in the no-CUDA build
+    throw_no_cuda(); // no-CUDA build: report the missing CUDA support instead of constructing anything
+#else
+    impl_ = makePtr<Impl>(allocator); // Impl(allocator) creates its own CUDA stream and stores `allocator`
+#endif
+}
+
 bool cv::cuda::Stream::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
@@ -668,20 +685,33 @@ void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCoun
 #endif
 }
 
-#ifdef HAVE_CUDA
-
-cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator.get())
+#ifndef HAVE_CUDA
+cv::cuda::BufferPool::BufferPool(Stream& stream)
+{
+    (void) stream;
+    throw_no_cuda();
+}
+#else
+cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->allocator)
 {
 }
+#endif
 
 GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
 {
+#ifndef HAVE_CUDA
+    (void) rows;
+    (void) cols;
+    (void) type;
+    throw_no_cuda();
+    return GpuMat();
+#else
     GpuMat buf(allocator_);
     buf.create(rows, cols, type);
     return buf;
+#endif
 }
 
-#endif
 
 ////////////////////////////////////////////////////////////////
 // Event
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index f553c4f31e..e33b105ba6 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -3342,6 +3342,9 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
 
     CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
 
+    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
+    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );
+
     if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
         _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
     else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 135397aa7d..2e479f7c56 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1395,7 +1395,7 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
     {
         const std::vector<uchar>& v = *(const std::vector<uchar>*)obj;
 
-        size_t n = v.size(), esz = CV_ELEM_SIZE(flags);
+        size_t n = size().width, esz = CV_ELEM_SIZE(flags);
         int t = CV_MAT_DEPTH(flags), cn = CV_MAT_CN(flags);
         mv.resize(n);
 
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 33bed26c85..64314a2206 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -3434,7 +3434,7 @@ int Kernel::set(int i, const KernelArg& arg)
             if( !(arg.flags & KernelArg::NO_SIZE) )
             {
                 int cols = u3d.cols*arg.wscale/arg.iwscale;
-                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices) == CL_SUCCESS);
                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
                 i += 3;
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 69446beb2a..6a63e84ef6 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -272,8 +272,8 @@ struct CoreTLSData
 
     RNG rng;
 //#ifdef HAVE_OPENCL
-    int device;
-    ocl::Queue oclQueue;
+    int device; // device index of an array of devices in a context, see also Device::getDefault
+    ocl::Queue oclQueue; // the queue used for running a kernel, see also getQueue, Kernel::run
     int useOpenCL; // 1 - use, 0 - do not use, -1 - auto/not initialized
 //#endif
     int useIPP; // 1 - use, 0 - do not use, -1 - auto/not initialized
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 37a12411b3..e5402614eb 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -237,24 +237,81 @@ void Exception::formatMessage()
         msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
 }
 
+static const char* g_hwFeatureNames[CV_HARDWARE_MAX_FEATURE] = { NULL };
+
+static const char* getHWFeatureName(int id) // name registered for feature `id`; NULL when `id` is out of range or has no name
+{
+    return (id >= 0 && id < CV_HARDWARE_MAX_FEATURE) ? g_hwFeatureNames[id] : NULL; // check the lower bound too: a negative id must not index the table
+}
+static const char* getHWFeatureNameSafe(int id) // same lookup as getHWFeatureName(), but never yields a null pointer
+{
+    const char* const featureName = getHWFeatureName(id);
+    return !featureName ? "Unknown feature" : featureName; // placeholder text when no name is registered
+}
+
 struct HWFeatures
 {
     enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
 
-    HWFeatures(void)
+    HWFeatures(bool run_initialize = false)
     {
-        memset( have, 0, sizeof(have) );
-        x86_family = 0;
+        memset( have, 0, sizeof(have[0]) * MAX_FEATURE );
+        if (run_initialize)
+            initialize();
     }
 
-    static HWFeatures initialize(void)
+    static void initializeNames()
     {
-        HWFeatures f;
+        for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
+        {
+            g_hwFeatureNames[i] = 0;
+        }
+        g_hwFeatureNames[CPU_MMX] = "MMX";
+        g_hwFeatureNames[CPU_SSE] = "SSE";
+        g_hwFeatureNames[CPU_SSE2] = "SSE2";
+        g_hwFeatureNames[CPU_SSE3] = "SSE3";
+        g_hwFeatureNames[CPU_SSSE3] = "SSSE3";
+        g_hwFeatureNames[CPU_SSE4_1] = "SSE4.1";
+        g_hwFeatureNames[CPU_SSE4_2] = "SSE4.2";
+        g_hwFeatureNames[CPU_POPCNT] = "POPCNT";
+        g_hwFeatureNames[CPU_FP16] = "FP16";
+        g_hwFeatureNames[CPU_AVX] = "AVX";
+        g_hwFeatureNames[CPU_AVX2] = "AVX2";
+        g_hwFeatureNames[CPU_FMA3] = "FMA3";
+
+        g_hwFeatureNames[CPU_AVX_512F] = "AVX512F";
+        g_hwFeatureNames[CPU_AVX_512BW] = "AVX512BW";
+        g_hwFeatureNames[CPU_AVX_512CD] = "AVX512CD";
+        g_hwFeatureNames[CPU_AVX_512DQ] = "AVX512DQ";
+        g_hwFeatureNames[CPU_AVX_512ER] = "AVX512ER";
+        g_hwFeatureNames[CPU_AVX_512IFMA512] = "AVX512IFMA";
+        g_hwFeatureNames[CPU_AVX_512PF] = "AVX512PF";
+        g_hwFeatureNames[CPU_AVX_512VBMI] = "AVX512VBMI";
+        g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL";
+
+        g_hwFeatureNames[CPU_NEON] = "NEON";
+    }
+
+    void initialize(void)
+    {
+#ifndef WINRT
+        if (getenv("OPENCV_DUMP_CONFIG"))
+        {
+            fprintf(stderr, "\nOpenCV build configuration is:\n%s\n",
+                cv::getBuildInformation().c_str());
+        }
+#endif
+
+        initializeNames();
+
         int cpuid_data[4] = { 0, 0, 0, 0 };
+        int cpuid_data_ex[4] = { 0, 0, 0, 0 };
 
     #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    #define OPENCV_HAVE_X86_CPUID 1
         __cpuid(cpuid_data, 1);
     #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    #define OPENCV_HAVE_X86_CPUID 1
         #ifdef __x86_64__
         asm __volatile__
         (
@@ -278,33 +335,36 @@ struct HWFeatures
         #endif
     #endif
 
-        f.x86_family = (cpuid_data[0] >> 8) & 15;
-        if( f.x86_family >= 6 )
+    #ifdef OPENCV_HAVE_X86_CPUID
+        int x86_family = (cpuid_data[0] >> 8) & 15;
+        if( x86_family >= 6 )
         {
-            f.have[CV_CPU_MMX]    = (cpuid_data[3] & (1 << 23)) != 0;
-            f.have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
-            f.have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
-            f.have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
-            f.have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
-            f.have[CV_CPU_FMA3]  = (cpuid_data[2] & (1<<12)) != 0;
-            f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
-            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
-            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
-            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
-            f.have[CV_CPU_FP16]   = (cpuid_data[2] & (1<<29)) != 0;
+            have[CV_CPU_MMX]    = (cpuid_data[3] & (1<<23)) != 0;
+            have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
+            have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
+            have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
+            have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
+            have[CV_CPU_FMA3]   = (cpuid_data[2] & (1<<12)) != 0;
+            have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
+            have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
+            have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
+            have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
+            have[CV_CPU_FP16]   = (cpuid_data[2] & (1<<29)) != 0;
 
             // make the second call to the cpuid command in order to get
             // information about extended features like AVX2
         #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-            __cpuidex(cpuid_data, 7, 0);
+        #define OPENCV_HAVE_X86_CPUID_EX 1
+            __cpuidex(cpuid_data_ex, 7, 0);
         #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+        #define OPENCV_HAVE_X86_CPUID_EX 1
             #ifdef __x86_64__
             asm __volatile__
             (
              "movl $7, %%eax\n\t"
              "movl $0, %%ecx\n\t"
              "cpuid\n\t"
-             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :[eax]"=a"(cpuid_data_ex[0]),[ebx]"=b"(cpuid_data_ex[1]),[ecx]"=c"(cpuid_data_ex[2]),[edx]"=d"(cpuid_data_ex[3])
              :
              : "cc"
             );
@@ -317,29 +377,76 @@ struct HWFeatures
              "cpuid\n\t"
              "movl %%ebx, %0\n\t"
              "popl %%ebx\n\t"
-             : "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
+             : "=r"(cpuid_data_ex[1]), "=c"(cpuid_data_ex[2])
              :
              : "cc"
             );
             #endif
         #endif
-            f.have[CV_CPU_AVX2]   = (cpuid_data[1] & (1<<5)) != 0;
-
-            f.have[CV_CPU_AVX_512F]       = (cpuid_data[1] & (1<<16)) != 0;
-            f.have[CV_CPU_AVX_512DQ]      = (cpuid_data[1] & (1<<17)) != 0;
-            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
-            f.have[CV_CPU_AVX_512PF]      = (cpuid_data[1] & (1<<26)) != 0;
-            f.have[CV_CPU_AVX_512ER]      = (cpuid_data[1] & (1<<27)) != 0;
-            f.have[CV_CPU_AVX_512CD]      = (cpuid_data[1] & (1<<28)) != 0;
-            f.have[CV_CPU_AVX_512BW]      = (cpuid_data[1] & (1<<30)) != 0;
-            f.have[CV_CPU_AVX_512VL]      = (cpuid_data[1] & (1<<31)) != 0;
-            f.have[CV_CPU_AVX_512VBMI]    = (cpuid_data[2] &  (1<<1)) != 0;
+
+        #ifdef OPENCV_HAVE_X86_CPUID_EX
+            have[CV_CPU_AVX2]   = (cpuid_data_ex[1] & (1<<5)) != 0;
+
+            have[CV_CPU_AVX_512F]       = (cpuid_data_ex[1] & (1<<16)) != 0;
+            have[CV_CPU_AVX_512DQ]      = (cpuid_data_ex[1] & (1<<17)) != 0;
+            have[CV_CPU_AVX_512IFMA512] = (cpuid_data_ex[1] & (1<<21)) != 0;
+            have[CV_CPU_AVX_512PF]      = (cpuid_data_ex[1] & (1<<26)) != 0;
+            have[CV_CPU_AVX_512ER]      = (cpuid_data_ex[1] & (1<<27)) != 0;
+            have[CV_CPU_AVX_512CD]      = (cpuid_data_ex[1] & (1<<28)) != 0;
+            have[CV_CPU_AVX_512BW]      = (cpuid_data_ex[1] & (1<<30)) != 0;
+            have[CV_CPU_AVX_512VL]      = (cpuid_data_ex[1] & (1<<31)) != 0;
+            have[CV_CPU_AVX_512VBMI]    = (cpuid_data_ex[2] & (1<<1)) != 0;
+        #else
+            CV_UNUSED(cpuid_data_ex);
+        #endif
+
+            bool have_AVX_OS_support = true; // assumed present until one of the checks below clears it
+            bool have_AVX512_OS_support = true;
+            if (!(cpuid_data[2] & (1<<27)))
+                have_AVX_OS_support = false; // bit 27 is clear: the OS does not use XSAVE/XRSTORE, so AVX gets disabled
+            else
+            {
+                int xcr0 = 0; // stays 0 (both checks below fail) when neither xgetbv path is compiled in
+            #ifdef _XCR_XFEATURE_ENABLED_MASK // requires immintrin.h
+                xcr0 = (int)_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+            #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+                __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
+            #endif
+                if ((xcr0 & 0x6) != 0x6)
+                    have_AVX_OS_support = false; // YMM registers: mask 0x6 is not fully set in xcr0
+                if ((xcr0 & 0xe6) != 0xe6)
+                    have_AVX512_OS_support = false; // ZMM registers: mask 0xe6 is not fully set in xcr0
+            }
+
+            if (!have_AVX_OS_support)
+            {
+                have[CV_CPU_AVX] = false;
+                have[CV_CPU_FP16] = false;
+                have[CV_CPU_AVX2] = false;
+                have[CV_CPU_FMA3] = false;
+            }
+            if (!have_AVX_OS_support || !have_AVX512_OS_support)
+            {
+                have[CV_CPU_AVX_512F] = false;
+                have[CV_CPU_AVX_512BW] = false;
+                have[CV_CPU_AVX_512CD] = false;
+                have[CV_CPU_AVX_512DQ] = false;
+                have[CV_CPU_AVX_512ER] = false;
+                have[CV_CPU_AVX_512IFMA512] = false;
+                have[CV_CPU_AVX_512PF] = false;
+                have[CV_CPU_AVX_512VBMI] = false;
+                have[CV_CPU_AVX_512VL] = false;
+            }
         }
+    #else
+        CV_UNUSED(cpuid_data);
+        CV_UNUSED(cpuid_data_ex);
+    #endif // OPENCV_HAVE_X86_CPUID
 
     #if defined ANDROID || defined __linux__
     #ifdef __aarch64__
-        f.have[CV_CPU_NEON] = true;
-        f.have[CV_CPU_FP16] = true;
+        have[CV_CPU_NEON] = true;
+        have[CV_CPU_FP16] = true;
     #elif defined __arm__
         int cpufile = open("/proc/self/auxv", O_RDONLY);
 
@@ -352,8 +459,8 @@ struct HWFeatures
             {
                 if (auxv.a_type == AT_HWCAP)
                 {
-                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
-                    f.have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
+                    have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
+                    have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
                     break;
                 }
             }
@@ -363,21 +470,133 @@ struct HWFeatures
     #endif
     #elif (defined __clang__ || defined __APPLE__)
     #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
-        f.have[CV_CPU_NEON] = true;
+        have[CV_CPU_NEON] = true;
     #endif
     #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
-        f.have[CV_CPU_FP16] = true;
+        have[CV_CPU_FP16] = true;
     #endif
     #endif
 
-        return f;
+        int baseline_features[] = { CV_CPU_BASELINE_FEATURES };
+        if (!checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0])))
+        {
+            fprintf(stderr, "\n"
+                    "******************************************************************\n"
+                    "* FATAL ERROR:                                                   *\n"
+                    "* This OpenCV build doesn't support current CPU/HW configuration *\n"
+                    "*                                                                *\n"
+                    "* Use OPENCV_DUMP_CONFIG=1 environment variable for details      *\n"
+                    "******************************************************************\n");
+            fprintf(stderr, "\nRequired baseline features:\n");
+            checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0]), true);
+            CV_ErrorNoReturn(cv::Error::StsAssert, "Missing support for required CPU baseline features. Check OpenCV build configuration and required CPU/HW setup.");
+        }
+
+        readSettings(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0]));
+    }
+
+    bool checkFeatures(const int* features, int count, bool dump = false) // true only if have[] is set for every non-zero id in features[0..count)
+    {
+        bool result = true;
+        for (int i = 0; i < count; i++)
+        {
+            int feature = features[i];
+            if (feature) // an id of 0 is skipped and never affects the result
+            {
+                if (have[feature])
+                {
+                    if (dump) fprintf(stderr, "%s - OK\n", getHWFeatureNameSafe(feature));
+                }
+                else
+                {
+                    result = false; // keep scanning so that dump mode reports every feature
+                    if (dump) fprintf(stderr, "%s - NOT AVAILABLE\n", getHWFeatureNameSafe(feature));
+                }
+            }
+        }
+        return result;
+    }
+
+    static inline bool isSymbolSeparator(char c) // true for the characters that delimit feature names: ',', ';' and '-'
+    {
+        return c == ',' || c == ';' || c == '-';
+    }
+
+    void readSettings(const int* baseline_features, int baseline_count) // clears have[] for each feature named in OPENCV_CPU_DISABLE
+    {
+        bool dump = true; // never changed here, so every diagnostic below is printed to stderr
+        const char* disabled_features =
+#ifndef WINRT
+                getenv("OPENCV_CPU_DISABLE");
+#else
+                NULL;
+#endif
+        if (disabled_features && disabled_features[0] != 0)
+        {
+            const char* start = disabled_features;
+            for (;;)
+            {
+                while (start[0] != 0 && isSymbolSeparator(start[0])) // skip any run of separators
+                {
+                    start++;
+                }
+                if (start[0] == 0)
+                    break;
+                const char* end = start;
+                while (end[0] != 0 && !isSymbolSeparator(end[0])) // advance to the end of the current name
+                {
+                    end++;
+                }
+                if (end == start) // NOTE(review): looks unreachable - start[0] is a non-separator here, so end advances at least once
+                    continue;
+                cv::String feature(start, end);
+                start = end;
+
+                CV_Assert(feature.size() > 0);
+
+                bool found = false;
+                for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
+                {
+                    if (!g_hwFeatureNames[i]) continue; // feature ids without a registered name cannot be matched
+                    size_t len = strlen(g_hwFeatureNames[i]);
+                    if (len != feature.size()) continue; // length pre-check before the full comparison
+                    if (feature.compare(g_hwFeatureNames[i]) == 0)
+                    {
+                        bool isBaseline = false;
+                        for (int k = 0; k < baseline_count; k++)
+                        {
+                            if (baseline_features[k] == i)
+                            {
+                                isBaseline = true;
+                                break;
+                            }
+                        }
+                        if (isBaseline)
+                        {
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable baseline CPU feature: '%s'. This has very limited effect, because code optimizations for this feature are executed unconditionally in the most cases.\n", getHWFeatureNameSafe(i));
+                        }
+                        if (!have[i])
+                        {
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable unavailable CPU feature on the current platform: '%s'.\n", getHWFeatureNameSafe(i));
+                        }
+                        have[i] = false; // the feature is cleared even when one of the warnings above was printed
+
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found)
+                {
+                    if (dump) fprintf(stderr, "OPENCV: Trying to disable unknown CPU feature: '%s'.\n", feature.c_str());
+                }
+            }
+        }
     }
 
-    int x86_family;
     bool have[MAX_FEATURE+1];
 };
 
-static HWFeatures  featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
+static HWFeatures  featuresEnabled(true), featuresDisabled = HWFeatures(false);
 static HWFeatures* currentFeatures = &featuresEnabled;
 
 bool checkHardwareSupport(int feature)
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 910d6d43a3..33af4c8b76 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -1014,7 +1014,7 @@ TEST(Core_InputOutput, filestorage_yaml_advanvced_type_heading)
     ASSERT_EQ(cv::norm(inputMatrix, actualMatrix, NORM_INF), 0.);
 }
 
-TEST(Core_InputOutput, filestorage_keypoints_io)
+TEST(Core_InputOutput, filestorage_keypoints_vec_vec_io)
 {
     vector<vector<KeyPoint> > kptsVec;
     vector<KeyPoint> kpts;
@@ -1051,36 +1051,111 @@ TEST(Core_InputOutput, filestorage_keypoints_io)
     }
 }
 
-TEST(Core_InputOutput, filestorage_dmatch_io)
+TEST(Core_InputOutput, FileStorage_DMatch) // round-trips a single cv::DMatch through an in-memory FileStorage
 {
-    vector<vector<DMatch> > matchesVec;
-    vector<DMatch> matches;
-    matches.push_back(DMatch(1, 0, 10, 11.5f));
-    matches.push_back(DMatch(2, 1, 11, 21.5f));
-    matchesVec.push_back(matches);
-    matches.clear();
-    matches.push_back(DMatch(22, 10, 1, 1.5f));
-    matchesVec.push_back(matches);
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
 
-    FileStorage writer("", FileStorage::WRITE + FileStorage::MEMORY + FileStorage::FORMAT_XML);
-    writer << "dmatches" << matchesVec;
-    String content = writer.releaseAndGetString();
+    cv::DMatch d(1, 2, 3, -1.5f);
 
-    FileStorage reader(content, FileStorage::READ + FileStorage::MEMORY);
-    vector<vector<DMatch> > readKptsVec;
-    reader["dmatches"] >> readKptsVec;
+    EXPECT_NO_THROW(fs << "d" << d);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(), "%YAML:1.0\n---\nd: [ 1, 2, 3, -1.5000000000000000e+00 ]\n"); // pins the exact text: one flat 4-value sequence
+
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
+
+    cv::DMatch d_read;
+    ASSERT_NO_THROW(fs_read["d"] >> d_read);
+
+    EXPECT_EQ(d.queryIdx, d_read.queryIdx);
+    EXPECT_EQ(d.trainIdx, d_read.trainIdx);
+    EXPECT_EQ(d.imgIdx, d_read.imgIdx);
+    EXPECT_EQ(d.distance, d_read.distance); // exact float comparison of the value read back
+}
 
-    ASSERT_EQ(matchesVec.size(), readKptsVec.size());
+TEST(Core_InputOutput, FileStorage_DMatch_vector) // round-trips a std::vector<cv::DMatch> through an in-memory FileStorage
+{
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+
+    cv::DMatch d1(1, 2, 3, -1.5f);
+    cv::DMatch d2(2, 3, 4, 1.5f);
+    cv::DMatch d3(3, 2, 1, 0.5f);
+    std::vector<cv::DMatch> dv;
+    dv.push_back(d1);
+    dv.push_back(d2);
+    dv.push_back(d3);
+
+    EXPECT_NO_THROW(fs << "dv" << dv);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(), // expected text: all matches flattened into one sequence, 4 values per DMatch
+"%YAML:1.0\n"
+"---\n"
+"dv: [ 1, 2, 3, -1.5000000000000000e+00, 2, 3, 4, 1.5000000000000000e+00,\n"
+"    3, 2, 1, 5.0000000000000000e-01 ]\n"
+);
+
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
+
+    std::vector<cv::DMatch> dv_read;
+    ASSERT_NO_THROW(fs_read["dv"] >> dv_read);
+
+    ASSERT_EQ(dv.size(), dv_read.size()); // fatal on mismatch, so the loop below never indexes past dv_read
+    for (size_t i = 0; i < dv.size(); i++)
+    {
+        EXPECT_EQ(dv[i].queryIdx, dv_read[i].queryIdx);
+        EXPECT_EQ(dv[i].trainIdx, dv_read[i].trainIdx);
+        EXPECT_EQ(dv[i].imgIdx, dv_read[i].imgIdx);
+        EXPECT_EQ(dv[i].distance, dv_read[i].distance);
+    }
+}
 
-    for(size_t i = 0; i < matchesVec.size(); i++)
+TEST(Core_InputOutput, FileStorage_DMatch_vector_vector) // round-trips a vector of DMatch vectors through an in-memory FileStorage
+{
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+
+    cv::DMatch d1(1, 2, 3, -1.5f);
+    cv::DMatch d2(2, 3, 4, 1.5f);
+    cv::DMatch d3(3, 2, 1, 0.5f);
+    std::vector<cv::DMatch> dv1;
+    dv1.push_back(d1);
+    dv1.push_back(d2);
+    dv1.push_back(d3);
+
+    std::vector<cv::DMatch> dv2; // second row has a different length (2) than the first (3)
+    dv2.push_back(d3);
+    dv2.push_back(d1);
+
+    std::vector< std::vector<cv::DMatch> > dvv;
+    dvv.push_back(dv1);
+    dvv.push_back(dv2);
+
+    EXPECT_NO_THROW(fs << "dvv" << dvv);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(), // expected text: one flattened sequence per inner vector
+"%YAML:1.0\n"
+"---\n"
+"dvv:\n"
+"   - [ 1, 2, 3, -1.5000000000000000e+00, 2, 3, 4, 1.5000000000000000e+00,\n"
+"       3, 2, 1, 5.0000000000000000e-01 ]\n"
+"   - [ 3, 2, 1, 5.0000000000000000e-01, 1, 2, 3, -1.5000000000000000e+00 ]\n"
+);
+
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
+
+    std::vector< std::vector<cv::DMatch> > dvv_read;
+    ASSERT_NO_THROW(fs_read["dvv"] >> dvv_read);
+
+    ASSERT_EQ(dvv.size(), dvv_read.size()); // outer size check: guards dvv_read[j] below
+    for (size_t j = 0; j < dvv.size(); j++)
     {
-        ASSERT_EQ(matchesVec[i].size(), readKptsVec[i].size());
-        for(size_t j = 0; j < matchesVec[i].size(); j++)
+        const std::vector<cv::DMatch>& dv = dvv[j];
+        const std::vector<cv::DMatch>& dv_read = dvv_read[j];
+        ASSERT_EQ(dv.size(), dv_read.size()); // per-row size check: guards dv_read[i] below (previously re-checked the outer sizes)
+        for (size_t i = 0; i < dv.size(); i++)
         {
-            ASSERT_FLOAT_EQ(matchesVec[i][j].distance, readKptsVec[i][j].distance);
-            ASSERT_EQ(matchesVec[i][j].imgIdx, readKptsVec[i][j].imgIdx);
-            ASSERT_EQ(matchesVec[i][j].queryIdx, readKptsVec[i][j].queryIdx);
-            ASSERT_EQ(matchesVec[i][j].trainIdx, readKptsVec[i][j].trainIdx);
+            EXPECT_EQ(dv[i].queryIdx, dv_read[i].queryIdx);
+            EXPECT_EQ(dv[i].trainIdx, dv_read[i].trainIdx);
+            EXPECT_EQ(dv[i].imgIdx, dv_read[i].imgIdx);
+            EXPECT_EQ(dv[i].distance, dv_read[i].distance);
         }
     }
 }
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 9a379f468d..98ce812ee9 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -659,6 +659,18 @@ struct InitializerFunctor{
     }
 };
 
+template<typename Pixel>
+struct InitializerFunctor5D{
+    /// Initializer for the cv::Mat::forEach test (5-dimensional case): copies idx[0..4] into pixel[0..4]
+    void operator()(Pixel & pixel, const int * idx) const {
+        pixel[0] = idx[0];
+        pixel[1] = idx[1];
+        pixel[2] = idx[2];
+        pixel[3] = idx[3];
+        pixel[4] = idx[4];
+    }
+};
+
 void Core_ArrayOpTest::run( int /* start_from */)
 {
     int errcount = 0;
@@ -736,6 +748,57 @@ void Core_ArrayOpTest::run( int /* start_from */)
         }
     }
 
+    // test cv::Mat::forEach
+    // with a matrix that has more dimensions than columns
+    // See https://github.com/opencv/opencv/issues/8447
+    {
+        const int dims[5] = { 2, 2, 2, 2, 2 };
+        typedef cv::Vec<int, 5> Pixel;
+
+        cv::Mat a = cv::Mat::zeros(5, dims, CV_32SC(5));
+        InitializerFunctor5D<Pixel> initializer;
+
+        a.forEach<Pixel>(initializer);
+
+        uint64 total = 0;
+        bool error_reported = false;
+        for (int i0 = 0; i0 < dims[0]; ++i0) {
+            for (int i1 = 0; i1 < dims[1]; ++i1) {
+                for (int i2 = 0; i2 < dims[2]; ++i2) {
+                    for (int i3 = 0; i3 < dims[3]; ++i3) {
+                        for (int i4 = 0; i4 < dims[4]; ++i4) {
+                            const int i[5] = { i0, i1, i2, i3, i4 };
+                            Pixel& pixel = a.at<Pixel>(i);
+                            if (pixel[0] != i0 || pixel[1] != i1 || pixel[2] != i2 || pixel[3] != i3 || pixel[4] != i4) {
+                                if (!error_reported) {
+                                    ts->printf(cvtest::TS::LOG, "forEach is not correct.\n"
+                                        "First error detected at position (%d, %d, %d, %d, %d), got value (%d, %d, %d, %d, %d).\n",
+                                        i0, i1, i2, i3, i4,
+                                        pixel[0], pixel[1], pixel[2], pixel[3], pixel[4]);
+                                    error_reported = true;
+                                }
+                                errcount++;
+                            }
+                            total += pixel[0];
+                            total += pixel[1];
+                            total += pixel[2];
+                            total += pixel[3];
+                            total += pixel[4];
+                        }
+                    }
+                }
+            }
+        }
+        uint64 total2 = 0;
+        for (size_t i = 0; i < sizeof(dims) / sizeof(dims[0]); ++i) {
+            total2 += ((dims[i] - 1) * dims[i] / 2) * dims[0] * dims[1] * dims[2] * dims[3] * dims[4] / dims[i];
+        }
+        if (total != total2) {
+            ts->printf(cvtest::TS::LOG, "forEach is not correct because total is invalid.\n");
+            errcount++;
+        }
+    }
+
     RNG rng;
     const int MAX_DIM = 5, MAX_DIM_SZ = 10;
     // sparse matrix operations
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index f2ee84543f..a482b49fcf 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -788,6 +788,7 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr
 (obtained from dft_size ).
 -   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
 cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the input is complex, with 2 channels.
 -   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
 real-complex transform, so the destination matrix must be real.
 @param stream Stream for the asynchronous version.
@@ -813,6 +814,35 @@ instead of the width.
  */
 CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
 
+/** @brief Base class for DFT operator as a cv::Algorithm. :
+ */
+class CV_EXPORTS DFT : public Algorithm
+{
+public:
+    /** @brief Computes an FFT of a given image.
+
+    @param image Source image. CV_32FC1 and CV_32FC2 images are accepted; a 2-channel image requires DFT_COMPLEX_INPUT in the flags given to createDFT, a 1-channel image requires it to be absent.
+    @param result Result image.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void compute(InputArray image, OutputArray result, Stream& stream = Stream::Null()) = 0;
+};
+
+/** @brief Creates an implementation of cuda::DFT.
+
+@param dft_size The image size.
+@param flags Optional flags:
+-   **DFT_ROWS** transforms each individual row of the source matrix.
+-   **DFT_SCALE** scales the result: divide it by the number of elements in the transform
+(obtained from dft_size ).
+-   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
+cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the input is complex, with 2 channels.
+-   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
+real-complex transform, so the destination matrix must be real.
+ */
+CV_EXPORTS Ptr<DFT> createDFT(Size dft_size, int flags);
+
 /** @brief Base class for convolution (or cross-correlation) operator. :
  */
 class CV_EXPORTS Convolution : public Algorithm
diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp
index 08de4e4288..01a0169136 100644
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@@ -286,111 +286,146 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// dft
+// DFT function
 
 void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
-#ifndef HAVE_CUFFT
-    (void) _src;
-    (void) _dst;
-    (void) dft_size;
-    (void) flags;
-    (void) stream;
-    throw_no_cuda();
-#else
-    GpuMat src = getInputMat(_src, stream);
+    if (getInputMat(_src, stream).channels() == 2)
+        flags |= DFT_COMPLEX_INPUT;
 
-    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+    Ptr<DFT> dft = createDFT(dft_size, flags);
+    dft->compute(_src, _dst, stream);
+}
 
-    // We don't support unpacked output (in the case of real input)
-    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
+//////////////////////////////////////////////////////////////////////////////
+// DFT algorithm
 
-    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
-    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
-    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
-    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
-    const bool is_complex_input  = src.channels() == 2;
-    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+#ifdef HAVE_CUFFT
 
-    // We don't support real-to-real transform
-    CV_Assert( is_complex_input || is_complex_output );
+namespace
+{
 
-    // Make sure here we work with the continuous input,
-    // as CUFFT can't handle gaps
-    GpuMat src_cont;
-    if (src.isContinuous())
+    class DFTImpl : public DFT
     {
-        src_cont = src;
-    }
-    else
-    {
-        BufferPool pool(stream);
-        src_cont.allocator = pool.getAllocator();
-        createContinuous(src.rows, src.cols, src.type(), src_cont);
-        src.copyTo(src_cont, stream);
-    }
+        Size dft_size, dft_size_opt;
+        bool is_1d_input, is_row_dft, is_scaled_dft, is_inverse, is_complex_input, is_complex_output;
 
-    Size dft_size_opt = dft_size;
-    if (is_1d_input && !is_row_dft)
-    {
-        // If the source matrix is single column handle it as single row
-        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
-        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
-    }
+        cufftType dft_type;
+        cufftHandle plan;
 
-    CV_Assert( dft_size_opt.width > 1 );
+    public:
+        DFTImpl(Size dft_size, int flags)
+            : dft_size(dft_size),
+              dft_size_opt(dft_size),
+              is_1d_input((dft_size.height == 1) || (dft_size.width == 1)),
+              is_row_dft((flags & DFT_ROWS) != 0),
+              is_scaled_dft((flags & DFT_SCALE) != 0),
+              is_inverse((flags & DFT_INVERSE) != 0),
+              is_complex_input((flags & DFT_COMPLEX_INPUT) != 0),
+              is_complex_output(!(flags & DFT_REAL_OUTPUT)),
+              dft_type(!is_complex_input ? CUFFT_R2C : (is_complex_output ? CUFFT_C2C : CUFFT_C2R))
+        {
+            // We don't support unpacked output (in the case of real input)
+            CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
 
-    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input)
-        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
+            // We don't support real-to-real transform
+            CV_Assert( is_complex_input || is_complex_output );
 
-    cufftHandle plan;
-    if (is_1d_input || is_row_dft)
-        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
-    else
-        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+            if (is_1d_input && !is_row_dft)
+            {
+                // If the source matrix is single column handle it as single row
+                dft_size_opt.width = std::max(dft_size.width, dft_size.height);
+                dft_size_opt.height = std::min(dft_size.width, dft_size.height);
+            }
 
-    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
+            CV_Assert( dft_size_opt.width > 1 );
 
-    if (is_complex_input)
-    {
-        if (is_complex_output)
-        {
-            createContinuous(dft_size, CV_32FC2, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            if (is_1d_input || is_row_dft)
+                cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
+            else
+                cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+        }
 
-            cufftSafeCall(cufftExecC2C(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
-                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+        ~DFTImpl()
+        {
+            cufftSafeCall( cufftDestroy(plan) );
         }
-        else
+
+        void compute(InputArray _src, OutputArray _dst, Stream& stream)
         {
-            createContinuous(dft_size, CV_32F, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            GpuMat src = getInputMat(_src, stream);
 
-            cufftSafeCall(cufftExecC2R(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
-        }
-    }
-    else
-    {
-        // We could swap dft_size for efficiency. Here we must reflect it
-        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
-        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+            CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+            CV_Assert( is_complex_input == (src.channels() == 2) );
 
-        GpuMat dst = _dst.getGpuMat();
+            // Make sure here we work with the continuous input,
+            // as CUFFT can't handle gaps
+            GpuMat src_cont;
+            if (src.isContinuous())
+            {
+                src_cont = src;
+            }
+            else
+            {
+                BufferPool pool(stream);
+                src_cont.allocator = pool.getAllocator();
+                createContinuous(src.rows, src.cols, src.type(), src_cont);
+                src.copyTo(src_cont, stream);
+            }
 
-        cufftSafeCall(cufftExecR2C(
-                plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
-    }
+            cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
 
-    cufftSafeCall( cufftDestroy(plan) );
+            if (is_complex_input)
+            {
+                if (is_complex_output)
+                {
+                    createContinuous(dft_size, CV_32FC2, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2C(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                            is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+                }
+                else
+                {
+                    createContinuous(dft_size, CV_32F, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2R(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                }
+            }
+            else
+            {
+                // We could swap dft_size for efficiency. Here we must reflect it
+                if (dft_size == dft_size_opt)
+                    createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
+                else
+                    createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
 
-    if (is_scaled_dft)
-        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+                GpuMat dst = _dst.getGpuMat();
 
+                cufftSafeCall(cufftExecR2C(
+                                  plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+            }
+
+            if (is_scaled_dft)
+                cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+        }
+    };
+}
+
+#endif
+
+Ptr<DFT> cv::cuda::createDFT(Size dft_size, int flags) // factory: a DFTImpl when built with CUFFT, otherwise a StsNotImplemented error
+{
+#ifndef HAVE_CUFFT
+    (void) dft_size;
+    (void) flags;
+    CV_Error(Error::StsNotImplemented, "The library was build without CUFFT");
+    return Ptr<DFT>(); // follows CV_Error; presumably present only so that every path has a return statement
+#else
+    return makePtr<DFTImpl>(dft_size, flags);
 #endif
 }
 
diff --git a/modules/cudaarithm/test/test_arithm.cpp b/modules/cudaarithm/test/test_arithm.cpp
index 257f5233cc..3e99ed4f3f 100644
--- a/modules/cudaarithm/test/test_arithm.cpp
+++ b/modules/cudaarithm/test/test_arithm.cpp
@@ -250,6 +250,33 @@ CUDA_TEST_P(Dft, C2C)
     }
 }
 
+CUDA_TEST_P(Dft, Algorithm) // compares cv::cuda::DFT::compute against the CPU cv::dft
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    int flags = 0;
+    cv::Ptr<cv::cuda::DFT> dft = cv::cuda::createDFT(cv::Size(cols, rows), flags);
+
+    for (int i = 0; i < 5; ++i)
+    {
+        SCOPED_TRACE("dft algorithm");
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+
+        // The same DFT object is reused on every iteration with a fresh random input.
+        cv::cuda::GpuMat d_b;
+        dft->compute(loadMat(a), d_b);
+
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4); // tolerance scales with the element count
+    }
+}
+
 namespace
 {
     void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index dc876b744b..25a324a00f 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -201,6 +201,15 @@ CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int
  */
 CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());
 
+/** @brief Calculates the histogram of a one-channel 8-bit image, restricted to the pixels selected by a mask.
+
+@param src Source image with CV_8UC1 type.
+@param hist Destination histogram with one row, 256 columns, and the CV_32SC1 type.
+@param mask Mask image of the same size as src and of type CV_8UC1; only pixels with non-zero mask values are counted.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void calcHist(InputArray src, InputArray mask, OutputArray hist, Stream& stream = Stream::Null());
+
 /** @brief Equalizes the histogram of a grayscale image.
 
 @param src Source image with CV_8UC1 type.
diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index ba9290c190..be13091f12 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -105,6 +105,72 @@ namespace hist
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
+
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    {
+        __shared__ int shist[256]; // per-block partial histogram, one bin per thread
+        // Counts src pixels whose mask byte is non-zero; threads with the same threadIdx.y share image row y.
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        // tid doubles as a bin index below, so the block is expected to have exactly 256 threads.
+        shist[tid] = 0;
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
+            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            // NOTE(review): the 32-bit loads below presumably require 4-byte aligned row starts - confirm for ROI inputs.
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                unsigned int data = rowPtr[x];
+                unsigned int m = maskRowPtr[x];
+                // Four pixels per word: each data byte is counted only if the matching mask byte is non-zero.
+                if ((m >>  0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
+
+                if ((m >>  8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
+
+                if ((m >>  16) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+
+                if ((m >>  24) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+            // Trailing pixels (cols not a multiple of 4) are processed bytewise by a single thread per row.
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    unsigned int data = ((const uchar*)rowPtr)[x];
+                    unsigned int m = ((const uchar*)maskRowPtr)[x];
+
+                    if (m)
+                        Emulation::smem::atomicAdd(&shist[data], 1);
+                }
+            }
+        }
+
+        __syncthreads();
+        // Merge: each thread adds its bin of the block-local histogram to the global one.
+        const int histVal = shist[tid];
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal);
+    }
+
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    {
+        const dim3 block(32, 8); // 256 threads per block
+        const dim3 grid(divUp(src.rows, block.y)); // each block covers block.y image rows
+        // Masked variant: the mask pointer and step are passed alongside those of src.
+        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() ); // stream 0: wait for the device before returning
+    }
 }
 
 /////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index 59aa83343a..fce5057590 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -69,20 +69,32 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 namespace hist
 {
     void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
 }
 
 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
+{
+    calcHist(_src, cv::cuda::GpuMat(), _hist, stream); // an empty mask selects the unmasked path of the masked overload
+}
+
+void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, Stream& stream)
 {
     GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
     CV_Assert( src.type() == CV_8UC1 );
+    CV_Assert( mask.empty() || mask.type() == CV_8UC1 ); // the mask is optional, but must be 8-bit single-channel
+    CV_Assert( mask.empty() || mask.size() == src.size() ); // and must have exactly the size of src
 
     _hist.create(1, 256, CV_32SC1);
     GpuMat hist = _hist.getGpuMat();
 
     hist.setTo(Scalar::all(0), stream);
 
-    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    if (mask.empty()) // no mask: use the unmasked histogram256 overload
+        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    else
+        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index 3d3217375e..7fcde85206 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -136,6 +136,49 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES));
 
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+{
+    cv::cuda::DeviceInfo devInfo; // test parameter 0
+
+    cv::Size size; // test parameter 1: size of the source image and mask
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+
+        cv::cuda::setDevice(devInfo.deviceID()); // select the device given by parameter 0
+    }
+};
+
+CUDA_TEST_P(CalcHistWithMask, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat mask = randomMat(size, CV_8UC1);
+    cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0); // zero the top-left quarter of the mask
+
+    cv::cuda::GpuMat hist;
+    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+
+    cv::Mat hist_gold;
+    // Reference: CPU cv::calcHist on channel 0 with 256 bins over the range {0, 256}, using the same mask.
+    const int hbins = 256;
+    const float hranges[] = {0.0f, 256.0f};
+    const int histSize[] = {hbins};
+    const float* ranges[] = {hranges};
+    const int channels[] = {0};
+
+    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
+    hist_gold = hist_gold.reshape(1, 1); // single channel, single row
+    hist_gold.convertTo(hist_gold, CV_32S); // integer counts for the comparison below
+
+    EXPECT_MAT_NEAR(hist_gold, hist, 0.0); // zero tolerance: the counts must match exactly
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES));
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist
 
diff --git a/modules/features2d/CMakeLists.txt b/modules/features2d/CMakeLists.txt
index bf7d66e433..caae24af04 100644
--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@@ -1,2 +1,2 @@
 set(the_description "2D Features Framework")
-ocv_define_module(features2d opencv_imgproc opencv_ml opencv_flann OPTIONAL opencv_highgui WRAP java python)
+ocv_define_module(features2d opencv_imgproc opencv_flann OPTIONAL opencv_highgui WRAP java python)
diff --git a/modules/features2d/test/test_precomp.hpp b/modules/features2d/test/test_precomp.hpp
index 893b29b69b..bce72f7296 100644
--- a/modules/features2d/test/test_precomp.hpp
+++ b/modules/features2d/test/test_precomp.hpp
@@ -13,7 +13,6 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/features2d.hpp"
 #include "opencv2/imgcodecs.hpp"
-#include "opencv2/ml.hpp"
 #include <iostream>
 
 #endif
diff --git a/modules/flann/include/opencv2/flann/kdtree_index.h b/modules/flann/include/opencv2/flann/kdtree_index.h
index dc0971c9ef..3f6ee01740 100644
--- a/modules/flann/include/opencv2/flann/kdtree_index.h
+++ b/modules/flann/include/opencv2/flann/kdtree_index.h
@@ -125,7 +125,12 @@ public:
         /* Construct the randomized trees. */
         for (int i = 0; i < trees_; i++) {
             /* Randomize the order of vectors to allow for unbiased sampling. */
+#ifndef OPENCV_FLANN_USE_STD_RAND
+            cv::randShuffle(vind_);
+#else
             std::random_shuffle(vind_.begin(), vind_.end());
+#endif
+
             tree_roots_[i] = divideTree(&vind_[0], int(size_) );
         }
     }
diff --git a/modules/flann/include/opencv2/flann/lsh_table.h b/modules/flann/include/opencv2/flann/lsh_table.h
index 8ef2bd3810..2a52fbcf7e 100644
--- a/modules/flann/include/opencv2/flann/lsh_table.h
+++ b/modules/flann/include/opencv2/flann/lsh_table.h
@@ -350,7 +350,11 @@ inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int
     // A bit brutal but fast to code
     std::vector<size_t> indices(feature_size * CHAR_BIT);
     for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
+#ifndef OPENCV_FLANN_USE_STD_RAND
+    cv::randShuffle(indices);
+#else
     std::random_shuffle(indices.begin(), indices.end());
+#endif
 
     // Generate a random set of order of subsignature_size_ bits
     for (unsigned int i = 0; i < key_size_; ++i) {
diff --git a/modules/flann/include/opencv2/flann/random.h b/modules/flann/include/opencv2/flann/random.h
index a3cf5ec53d..d6784747c0 100644
--- a/modules/flann/include/opencv2/flann/random.h
+++ b/modules/flann/include/opencv2/flann/random.h
@@ -40,13 +40,31 @@
 namespace cvflann
 {
 
+inline int rand() // random int from cv::theRNG(), or from std::rand() when OPENCV_FLANN_USE_STD_RAND is defined
+{
+#ifndef OPENCV_FLANN_USE_STD_RAND
+#   if INT_MAX == RAND_MAX
+    int v = cv::theRNG().next() & INT_MAX; // masking with INT_MAX keeps the value within 0..INT_MAX
+#   else
+    int v = cv::theRNG().uniform(0, RAND_MAX + 1); // RAND_MAX != INT_MAX on this branch, so RAND_MAX + 1 does not overflow
+#   endif
+#else
+    int v = std::rand();
+#endif // OPENCV_FLANN_USE_STD_RAND
+    return v;
+}
+
 /**
  * Seeds the random number generator
  *  @param seed Random seed
  */
 inline void seed_random(unsigned int seed)
 {
-    srand(seed);
+#ifndef OPENCV_FLANN_USE_STD_RAND
+    cv::theRNG() = cv::RNG(seed); // replace the generator returned by cv::theRNG() with one built from seed
+#else
+    std::srand(seed);
+#endif
 }
 
 /*
@@ -60,7 +78,7 @@ inline void seed_random(unsigned int seed)
  */
 inline double rand_double(double high = 1.0, double low = 0)
 {
-    return low + ((high-low) * (std::rand() / (RAND_MAX + 1.0)));
+    return low + ((high-low) * (rand() / (RAND_MAX + 1.0))); // unqualified rand() is cvflann::rand() declared above
 }
 
 /**
@@ -71,7 +89,7 @@ inline double rand_double(double high = 1.0, double low = 0)
  */
 inline int rand_int(int high = RAND_MAX, int low = 0)
 {
-    return low + (int) ( double(high-low) * (std::rand() / (RAND_MAX + 1.0)));
+    return low + (int) ( double(high-low) * (rand() / (RAND_MAX + 1.0))); // unqualified rand() is cvflann::rand() declared above
 }
 
 /**
@@ -107,7 +125,11 @@ public:
         for (int i = 0; i < size_; ++i) vals_[i] = i;
 
         // shuffle the elements in the array
+#ifndef OPENCV_FLANN_USE_STD_RAND
+        cv::randShuffle(vals_);
+#else
         std::random_shuffle(vals_.begin(), vals_.end());
+#endif
 
         counter_ = 0;
     }
diff --git a/modules/flann/misc/python/pyopencv_flann.hpp b/modules/flann/misc/python/pyopencv_flann.hpp
index a9da8d0f00..6591b90aa6 100644
--- a/modules/flann/misc/python/pyopencv_flann.hpp
+++ b/modules/flann/misc/python/pyopencv_flann.hpp
@@ -23,6 +23,9 @@ bool pyopencv_to(PyObject *o, cv::flann::IndexParams& p, const char *name)
     PyObject* item = NULL;
     Py_ssize_t pos = 0;
 
+    if (!o || o == Py_None)
+        return true;
+
     if(PyDict_Check(o)) {
         while(PyDict_Next(o, &pos, &key, &item)) {
             if( !PyString_Check(key) ) {
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index b0363c4482..472d66799a 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(the_description "High-level GUI and Media I/O")
-ocv_add_module(highgui opencv_imgproc OPTIONAL opencv_imgcodecs opencv_videoio WRAP python)
+ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python)
 
 # ----------------------------------------------------------------------------
 #  CMake file for highgui. See root CMakeLists.txt
@@ -65,7 +65,7 @@ elseif(HAVE_QT)
 
   list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES})
   list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_QT.cpp ${_MOC_OUTFILES} ${_RCC_OUTFILES})
-  ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag)
+  ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag "")
   if(${_have_flag})
     set_source_files_properties(${_RCC_OUTFILES} PROPERTIES COMPILE_FLAGS -Wno-missing-declarations)
   endif()
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index d9e7ad8b1c..40af37cd86 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -50,10 +50,8 @@
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/highgui/highgui_c.h"
 
-#ifdef HAVE_OPENCV_IMGCODECS
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/imgcodecs/imgcodecs_c.h"
-#endif
 
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h
index b08d7133cf..8be4bf4f48 100644
--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@@ -42,6 +42,10 @@
 
 #include "precomp.hpp"
 
+#ifndef _DEBUG
+#define QT_NO_DEBUG_OUTPUT
+#endif
+
 #if defined( HAVE_QT_OPENGL )
 #include <QtOpenGL>
 #include <QGLWidget>
diff --git a/modules/imgcodecs/src/ios_conversions.mm b/modules/imgcodecs/src/ios_conversions.mm
index eed867a790..202cfe30d1 100644
--- a/modules/imgcodecs/src/ios_conversions.mm
+++ b/modules/imgcodecs/src/ios_conversions.mm
@@ -53,7 +53,7 @@ void UIImageToMat(const UIImage* image, cv::Mat& m, bool alphaExist);
 UIImage* MatToUIImage(const cv::Mat& image) {
 
     NSData *data = [NSData dataWithBytes:image.data
-                                  length:image.elemSize()*image.total()];
+                                  length:image.step.p[0] * image.rows];
 
     CGColorSpaceRef colorSpace;
 
@@ -73,7 +73,7 @@ UIImage* MatToUIImage(const cv::Mat& image) {
     // Creating CGImage from cv::Mat
     CGImageRef imageRef = CGImageCreate(image.cols,
                                         image.rows,
-                                        8,
+                                        8 * image.elemSize1(),
                                         8 * image.elemSize(),
                                         image.step.p[0],
                                         colorSpace,
@@ -97,7 +97,7 @@ UIImage* MatToUIImage(const cv::Mat& image) {
 void UIImageToMat(const UIImage* image,
                          cv::Mat& m, bool alphaExist) {
     CGColorSpaceRef colorSpace = CGImageGetColorSpace(image.CGImage);
-    CGFloat cols = image.size.width, rows = image.size.height;
+    CGFloat cols = CGImageGetWidth(image.CGImage), rows = CGImageGetHeight(image.CGImage);
     CGContextRef contextRef;
     CGBitmapInfo bitmapInfo = kCGImageAlphaPremultipliedLast;
     if (CGColorSpaceGetModel(colorSpace) == kCGColorSpaceModelMonochrome)
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 9914f63ff7..8536e41cc8 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -452,6 +452,20 @@ enum ContourApproximationModes {
     CHAIN_APPROX_TC89_KCOS = 4
 };
 
+/** @brief Shape matching methods
+
+\f$A\f$ denotes object1, \f$B\f$ denotes object2
+
+\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
+
+and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$ , respectively.
+*/
+enum ShapeMatchModes {
+    CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
 //! @} imgproc_shape
 
 //! Variants of a Hough transform
@@ -2588,9 +2602,8 @@ The function supports multi-channel images. Each channel is processed independen
 The functions accumulate\* can be used, for example, to collect statistics of a scene background
 viewed by a still camera and for the further foreground-background segmentation.
 
-@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
-@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
-floating-point.
+@param src Input image of type CV_8UC(n), CV_16UC(n), CV_32FC(n) or CV_64FC(n), where n is a positive integer.
+@param dst %Accumulator image with the same number of channels as input image, and a depth of CV_32F or CV_64F.
 @param mask Optional operation mask.
 
 @sa  accumulateSquare, accumulateProduct, accumulateWeighted
@@ -3916,7 +3929,7 @@ The function compares two shapes. All three implemented methods use the Hu invar
 
 @param contour1 First contour or grayscale image.
 @param contour2 Second contour or grayscale image.
-@param method Comparison method, see ::ShapeMatchModes
+@param method Comparison method, see cv::ShapeMatchModes
 @param parameter Method-specific parameter (not supported now).
  */
 CV_EXPORTS_W double matchShapes( InputArray contour1, InputArray contour2,
@@ -4081,7 +4094,13 @@ CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
 //! Detects position, translation and rotation
 CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
 
-//! Performs linear blending of two images
+//! Performs linear blending of two images:
+//! \f[ \texttt{dst}(i,j) = \texttt{weights1}(i,j)*\texttt{src1}(i,j) + \texttt{weights2}(i,j)*\texttt{src2}(i,j) \f]
+//! @param src1 It has a type of CV_8UC(n) or CV_32FC(n), where n is a positive integer.
+//! @param src2 It has the same type and size as src1.
+//! @param weights1 It has a type of CV_32FC1 and the same size as src1.
+//! @param weights2 It has a type of CV_32FC1 and the same size as src1.
+//! @param dst It is created if it does not have the same size and type as src1.
 CV_EXPORTS void blendLinear(InputArray src1, InputArray src2, InputArray weights1, InputArray weights2, OutputArray dst);
 
 //! @addtogroup imgproc_colormap
diff --git a/modules/imgproc/include/opencv2/imgproc/types_c.h b/modules/imgproc/include/opencv2/imgproc/types_c.h
index ca487d2bd4..13ffe1b1a3 100644
--- a/modules/imgproc/include/opencv2/imgproc/types_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/types_c.h
@@ -501,15 +501,8 @@ enum
     CV_POLY_APPROX_DP = 0
 };
 
-/** @brief Shape matching methods
-
-\f$A\f$ denotes object1,\f$B\f$ denotes object2
-
-\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
-
-and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$ , respectively.
-*/
-enum ShapeMatchModes
+/** Shape matching methods */
+enum
 {
     CV_CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
     CV_CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 6c4da5b5f3..f8e1134072 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1649,7 +1649,7 @@ struct VResizeLanczos4
     {
         CastOp castOp;
         VecOp vecOp;
-        int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
+        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
         #if CV_ENABLE_UNROLLED
         for( ; x <= width - 4; x += 4 )
         {
@@ -1657,7 +1657,7 @@ struct VResizeLanczos4
             const WT* S = src[0];
             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
 
-            for( k = 1; k < 8; k++ )
+            for( int k = 1; k < 8; k++ )
             {
                 b = beta[k]; S = src[k];
                 s0 += S[x]*b; s1 += S[x+1]*b;
@@ -3533,14 +3533,13 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                           int borderType, const Scalar& _borderValue )
 {
     Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
     const T* S0 = _src.ptr<T>();
+    T cval[CV_CN_MAX];
     size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);
 
     unsigned width1 = ssize.width, height1 = ssize.height;
 
@@ -3550,14 +3549,14 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
         dsize.height = 1;
     }
 
-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
     {
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
 
         if( cn == 1 )
         {
-            for( dx = 0; dx < dsize.width; dx++ )
+            for(int dx = 0; dx < dsize.width; dx++ )
             {
                 int sx = XY[dx*2], sy = XY[dx*2+1];
                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
@@ -3583,9 +3582,9 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
         }
         else
         {
-            for( dx = 0; dx < dsize.width; dx++, D += cn )
+            for(int dx = 0; dx < dsize.width; dx++, D += cn )
             {
-                int sx = XY[dx*2], sy = XY[dx*2+1], k;
+                int sx = XY[dx*2], sy = XY[dx*2+1];
                 const T *S;
                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                 {
@@ -3602,7 +3601,7 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                     else
                     {
                         S = S0 + sy*sstep + sx*cn;
-                        for( k = 0; k < cn; k++ )
+                        for(int k = 0; k < cn; k++ )
                             D[k] = S[k];
                     }
                 }
@@ -3622,7 +3621,7 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                         sy = borderInterpolate(sy, ssize.height, borderType);
                         S = S0 + sy*sstep + sx*cn;
                     }
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                         D[k] = S[k];
                 }
             }
@@ -3852,16 +3851,15 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
     Size ssize = _src.size(), dsize = _dst.size();
-    int k, cn = _src.channels();
+    const int cn = _src.channels();
     const AT* wtab = (const AT*)_wtab;
     const T* S0 = _src.ptr<T>();
     size_t sstep = _src.step/sizeof(S0[0]);
     T cval[CV_CN_MAX];
-    int dx, dy;
     CastOp castOp;
     VecOp vecOp;
 
-    for( k = 0; k < cn; k++ )
+    for(int k = 0; k < cn; k++ )
         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
 
     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
@@ -3871,7 +3869,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
         width1 = std::max(ssize.width-2, 0);
 #endif
 
-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
     {
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
@@ -3879,7 +3877,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
         int X0 = 0;
         bool prevInlier = false;
 
-        for( dx = 0; dx <= dsize.width; dx++ )
+        for(int dx = 0; dx <= dsize.width; dx++ )
         {
             bool curInlier = dx < dsize.width ?
                 (unsigned)XY[dx*2] < width1 &&
@@ -3948,7 +3946,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                         int sx = XY[dx*2], sy = XY[dx*2+1];
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx*cn;
-                        for( k = 0; k < cn; k++ )
+                        for(int k = 0; k < cn; k++ )
                         {
                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
                             D[k] = castOp(t0);
@@ -4012,7 +4010,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                             (sx >= ssize.width || sx+1 < 0 ||
                              sy >= ssize.height || sy+1 < 0) )
                         {
-                            for( k = 0; k < cn; k++ )
+                            for(int k = 0; k < cn; k++ )
                                 D[k] = cval[k];
                         }
                         else
@@ -4046,7 +4044,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
                             }
-                            for( k = 0; k < cn; k++ )
+                            for(int k = 0; k < cn; k++ )
                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
                         }
                     }
@@ -4064,16 +4062,16 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
     Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
     const AT* wtab = (const AT*)_wtab;
     const T* S0 = _src.ptr<T>();
     size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+    T cval[CV_CN_MAX];
     CastOp castOp;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);
+
     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
 
     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
@@ -4084,21 +4082,20 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
         dsize.height = 1;
     }
 
-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
     {
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
         const ushort* FXY = _fxy.ptr<ushort>(dy);
 
-        for( dx = 0; dx < dsize.width; dx++, D += cn )
+        for(int dx = 0; dx < dsize.width; dx++, D += cn )
         {
             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
             const AT* w = wtab + FXY[dx]*16;
-            int i, k;
             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
             {
                 const T* S = S0 + sy*sstep + sx*cn;
-                for( k = 0; k < cn; k++ )
+                for(int k = 0; k < cn; k++ )
                 {
                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
                     S += sstep;
@@ -4123,21 +4120,21 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
                     (sx >= ssize.width || sx+4 <= 0 ||
                     sy >= ssize.height || sy+4 <= 0))
                 {
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                         D[k] = cval[k];
                     continue;
                 }
 
-                for( i = 0; i < 4; i++ )
+                for(int i = 0; i < 4; i++ )
                 {
                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                 }
 
-                for( k = 0; k < cn; k++, S0++, w -= 16 )
+                for(int k = 0; k < cn; k++, S0++, w -= 16 )
                 {
                     WT cv = cval[k], sum = cv*ONE;
-                    for( i = 0; i < 4; i++, w += 4 )
+                    for(int i = 0; i < 4; i++, w += 4 )
                     {
                         int yi = y[i];
                         const T* S = S0 + yi*sstep;
@@ -4169,16 +4166,16 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
     Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
     const AT* wtab = (const AT*)_wtab;
     const T* S0 = _src.ptr<T>();
     size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+    T cval[CV_CN_MAX];
     CastOp castOp;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);
+
     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
 
     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
@@ -4189,21 +4186,20 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
         dsize.height = 1;
     }
 
-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
     {
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
         const ushort* FXY = _fxy.ptr<ushort>(dy);
 
-        for( dx = 0; dx < dsize.width; dx++, D += cn )
+        for(int dx = 0; dx < dsize.width; dx++, D += cn )
         {
             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
             const AT* w = wtab + FXY[dx]*64;
             const T* S = S0 + sy*sstep + sx*cn;
-            int i, k;
             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
             {
-                for( k = 0; k < cn; k++ )
+                for(int k = 0; k < cn; k++ )
                 {
                     WT sum = 0;
                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
@@ -4226,21 +4222,21 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
                     (sx >= ssize.width || sx+8 <= 0 ||
                     sy >= ssize.height || sy+8 <= 0))
                 {
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                         D[k] = cval[k];
                     continue;
                 }
 
-                for( i = 0; i < 8; i++ )
+                for(int i = 0; i < 8; i++ )
                 {
                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                 }
 
-                for( k = 0; k < cn; k++, S0++, w -= 64 )
+                for(int k = 0; k < cn; k++, S0++, w -= 64 )
                 {
                     WT cv = cval[k], sum = cv*ONE;
-                    for( i = 0; i < 8; i++, w += 8 )
+                    for(int i = 0; i < 8; i++, w += 8 )
                     {
                         int yi = y[i];
                         const T* S1 = S0 + yi*sstep;
diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl
index 9ceafd71aa..daa0483e10 100644
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@@ -160,6 +160,7 @@ enum
 #define CAT(x, y) __CAT(x, y)
 
 #define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
 
 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
 
@@ -182,7 +183,7 @@ __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offs
             {
                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
-                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE_3 src_pix = vload3(0, src);
 #ifdef DEPTH_5
                 dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
 #else
@@ -256,7 +257,7 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
             {
                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
-                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE_3 src_pix = vload3(0, src);
                 DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
 
 #ifdef DEPTH_5
diff --git a/modules/imgproc/src/undistort.cpp b/modules/imgproc/src/undistort.cpp
index 127481fe0b..6cbb4e2e87 100644
--- a/modules/imgproc/src/undistort.cpp
+++ b/modules/imgproc/src/undistort.cpp
@@ -476,8 +476,6 @@ static Point2f mapPointSpherical(const Point2f& p, float alpha, Vec4d* J, int pr
 
 static Point2f invMapPointSpherical(Point2f _p, float alpha, int projType)
 {
-    static int avgiter = 0, avgn = 0;
-
     double eps = 1e-12;
     Vec2d p(_p.x, _p.y), q(_p.x, _p.y), err;
     Vec4d J;
@@ -502,14 +500,6 @@ static Point2f invMapPointSpherical(Point2f _p, float alpha, int projType)
         //q -= Vec2d((J.t()*J).inv()*(J.t()*err));
     }
 
-    if( i < maxiter )
-    {
-        avgiter += i;
-        avgn++;
-        if( avgn == 1500 )
-            printf("avg iters = %g\n", (double)avgiter/avgn);
-    }
-
     return i < maxiter ? Point2f((float)q[0], (float)q[1]) : Point2f(-FLT_MAX, -FLT_MAX);
 }
 
diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
index 8246754387..8200e56d00 100644
--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@@ -1686,22 +1686,35 @@ TEST(Resize, Area_half)
 
 TEST(Imgproc_Warp, multichannel)
 {
+    static const int inter_types[] = {INTER_NEAREST, INTER_AREA, INTER_CUBIC,
+                                      INTER_LANCZOS4, INTER_LINEAR};
+    static const int inter_n = sizeof(inter_types) / sizeof(int);
+
+    static const int border_types[] = {BORDER_CONSTANT, BORDER_DEFAULT,
+                                       BORDER_REFLECT, BORDER_REPLICATE,
+                                       BORDER_WRAP, BORDER_WRAP};
+    static const int border_n = sizeof(border_types) / sizeof(int);
+
     RNG& rng = theRNG();
-    for( int iter = 0; iter < 30; iter++ )
+    for( int iter = 0; iter < 100; iter++ )
     {
+        int inter = inter_types[rng.uniform(0, inter_n)];
+        int border = border_types[rng.uniform(0, border_n)];
         int width = rng.uniform(3, 333);
         int height = rng.uniform(3, 333);
-        int cn = rng.uniform(1, 10);
+        int cn = rng.uniform(1, 15);
+        if(inter == INTER_CUBIC || inter == INTER_LANCZOS4)
+            cn = rng.uniform(1, 5);
         Mat src(height, width, CV_8UC(cn)), dst;
         //randu(src, 0, 256);
         src.setTo(0.);
 
-        Mat rot = getRotationMatrix2D(Point2f(0.f, 0.f), 1, 1);
-        warpAffine(src, dst, rot, src.size());
+        Mat rot = getRotationMatrix2D(Point2f(0.f, 0.f), 1.0, 1.0);
+        warpAffine(src, dst, rot, src.size(), inter, border);
         ASSERT_EQ(0.0, norm(dst, NORM_INF));
         Mat rot2 = Mat::eye(3, 3, rot.type());
         rot.copyTo(rot2.rowRange(0, 2));
-        warpPerspective(src, dst, rot2, src.size());
+        warpPerspective(src, dst, rot2, src.size(), inter, border);
         ASSERT_EQ(0.0, norm(dst, NORM_INF));
     }
 }
diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py
index 3c8da0b450..e63df430bd 100755
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -14,7 +14,8 @@ class_ignore_list = (
     #core
     "FileNode", "FileStorage", "KDTree", "KeyPoint", "DMatch",
     #features2d
-    "SimpleBlobDetector"
+    "SimpleBlobDetector",
+    "CirclesGridFinderParameters"
 )
 
 const_ignore_list = (
@@ -862,10 +863,13 @@ class ClassInfo(GeneralInfo):
         self.j_code = StringIO()
         self.jn_code = StringIO()
         self.cpp_code = StringIO();
-        if self.name != Module:
-            self.j_code.write(T_JAVA_START_INHERITED if self.base else T_JAVA_START_ORPHAN)
+        if self.base:
+            self.j_code.write(T_JAVA_START_INHERITED)
         else:
-            self.j_code.write(T_JAVA_START_MODULE)
+            if self.name != Module:
+                self.j_code.write(T_JAVA_START_ORPHAN)
+            else:
+                self.j_code.write(T_JAVA_START_MODULE)
         # misc handling
         if self.name == 'Core':
             self.imports.add("java.lang.String")
@@ -962,11 +966,11 @@ class JavaWrapperGenerator(object):
             logging.info('ignored: %s', classinfo)
             return
         name = classinfo.name
-        if self.isWrapped(name):
+        if self.isWrapped(name) and not classinfo.base:
             logging.warning('duplicated: %s', classinfo)
             return
         self.classes[name] = classinfo
-        if name in type_dict:
+        if name in type_dict and not classinfo.base:
             logging.warning('duplicated: %s', classinfo)
             return
         type_dict[name] = \
@@ -1520,7 +1524,7 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname
                 ci.jn_code.write( ManualFuncs[ci.name][func]["jn_code"] )
                 ci.cpp_code.write( ManualFuncs[ci.name][func]["cpp_code"] )
 
-        if ci.name != self.Module:
+        if ci.name != self.Module or ci.base:
             # finalize()
             ci.j_code.write(
 """
diff --git a/modules/java/generator/src/java/android+AsyncServiceHelper.java b/modules/java/generator/src/java/android+AsyncServiceHelper.java
index 4d9d115389..ebee2bb7f5 100644
--- a/modules/java/generator/src/java/android+AsyncServiceHelper.java
+++ b/modules/java/generator/src/java/android+AsyncServiceHelper.java
@@ -131,7 +131,7 @@ class AsyncServiceHelper
                 }
                 public void cancel()
                 {
-                    Log.d(TAG, "Wating for OpenCV canceled by user");
+                    Log.d(TAG, "Waiting for OpenCV canceled by user");
                     mServiceInstallationProgress = false;
                     int Status = LoaderCallbackInterface.INSTALL_CANCELED;
                     Log.d(TAG, "Init finished with status " + Status);
@@ -197,7 +197,7 @@ class AsyncServiceHelper
                                         if (mEngineService.installVersion(mOpenCVersion))
                                         {
                                             mLibraryInstallationProgress = true;
-                                            Log.d(TAG, "Package installation statred");
+                                            Log.d(TAG, "Package installation started");
                                             Log.d(TAG, "Unbind from service");
                                             mAppContext.unbindService(mServiceConnection);
                                         }
@@ -228,7 +228,7 @@ class AsyncServiceHelper
                                     mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INSTALL_CANCELED);
                                 }
                                 public void wait_install() {
-                                    Log.e(TAG, "Instalation was not started! Nothing to wait!");
+                                    Log.e(TAG, "Installation was not started! Nothing to wait!");
                                 }
                             };
 
diff --git a/modules/java/generator/src/java/android+BaseLoaderCallback.java b/modules/java/generator/src/java/android+BaseLoaderCallback.java
index 6d6a9b8dad..0b8aeedc6a 100644
--- a/modules/java/generator/src/java/android+BaseLoaderCallback.java
+++ b/modules/java/generator/src/java/android+BaseLoaderCallback.java
@@ -43,7 +43,7 @@ public abstract class BaseLoaderCallback implements LoaderCallbackInterface {
             /** Package installation has been canceled. **/
             case LoaderCallbackInterface.INSTALL_CANCELED:
             {
-                Log.d(TAG, "OpenCV library instalation was canceled by user");
+                Log.d(TAG, "OpenCV library installation was canceled by user");
                 finish();
             } break;
             /** Application is incompatible with this version of OpenCV Manager. Possibly, a service update is required. **/
diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
index ef2af818cf..54871057e2 100644
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -38,7 +38,7 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
     private Bitmap mCacheBitmap;
     private CvCameraViewListener2 mListener;
     private boolean mSurfaceExist;
-    private Object mSyncObject = new Object();
+    private final Object mSyncObject = new Object();
 
     protected int mFrameWidth;
     protected int mFrameHeight;
diff --git a/modules/java/generator/src/java/android+JavaCameraView.java b/modules/java/generator/src/java/android+JavaCameraView.java
index 70e5193932..2d972c9ce6 100644
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@@ -275,7 +275,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
             synchronized (this) {
                 this.notify();
             }
-            Log.d(TAG, "Wating for thread");
+            Log.d(TAG, "Waiting for thread");
             if (mThread != null)
                 mThread.join();
         } catch (InterruptedException e) {
diff --git a/modules/java/pure_test/CMakeLists.txt b/modules/java/pure_test/CMakeLists.txt
index 7d78414994..95b7e47512 100644
--- a/modules/java/pure_test/CMakeLists.txt
+++ b/modules/java/pure_test/CMakeLists.txt
@@ -1,4 +1,6 @@
-if(NOT ANT_EXECUTABLE)
+if(NOT ANT_EXECUTABLE
+  OR NOT BUILD_opencv_imgcodecs
+  OR NOT BUILD_opencv_calib3d)
   return()
 endif()
 
diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp
index 669e2d004e..a8b7114674 100644
--- a/modules/ml/include/opencv2/ml.hpp
+++ b/modules/ml/include/opencv2/ml.hpp
@@ -104,7 +104,7 @@ enum SampleTypes
 It is used for optimizing statmodel accuracy by varying model parameters, the accuracy estimate
 being computed by cross-validation.
  */
-class CV_EXPORTS ParamGrid
+class CV_EXPORTS_W ParamGrid
 {
 public:
     /** @brief Default constructor */
@@ -112,8 +112,8 @@ public:
     /** @brief Constructor with parameters */
     ParamGrid(double _minVal, double _maxVal, double _logStep);
 
-    double minVal; //!< Minimum value of the statmodel parameter. Default value is 0.
-    double maxVal; //!< Maximum value of the statmodel parameter. Default value is 0.
+    CV_PROP_RW double minVal; //!< Minimum value of the statmodel parameter. Default value is 0.
+    CV_PROP_RW double maxVal; //!< Maximum value of the statmodel parameter. Default value is 0.
     /** @brief Logarithmic step for iterating the statmodel parameter.
 
     The grid determines the following iteration sequence of the statmodel parameter values:
@@ -122,7 +122,15 @@ public:
     \f[\texttt{minVal} * \texttt{logStep} ^n <  \texttt{maxVal}\f]
     The grid is logarithmic, so logStep must always be greater then 1. Default value is 1.
     */
-    double logStep;
+    CV_PROP_RW double logStep;
+
+    /** @brief Creates a ParamGrid Ptr that can be given to the %SVM::trainAuto method
+
+    @param minVal minimum value of the parameter grid
+    @param maxVal maximum value of the parameter grid
+    @param logstep Logarithmic step for iterating the statmodel parameter
+    */
+    CV_WRAP static Ptr<ParamGrid> create(double minVal=0., double maxVal=0., double logstep=1.);
 };
 
 /** @brief Class encapsulating training data.
@@ -683,14 +691,54 @@ public:
     the usual %SVM with parameters specified in params is executed.
      */
     virtual bool trainAuto( const Ptr<TrainData>& data, int kFold = 10,
-                    ParamGrid Cgrid = SVM::getDefaultGrid(SVM::C),
-                    ParamGrid gammaGrid  = SVM::getDefaultGrid(SVM::GAMMA),
-                    ParamGrid pGrid      = SVM::getDefaultGrid(SVM::P),
-                    ParamGrid nuGrid     = SVM::getDefaultGrid(SVM::NU),
-                    ParamGrid coeffGrid  = SVM::getDefaultGrid(SVM::COEF),
-                    ParamGrid degreeGrid = SVM::getDefaultGrid(SVM::DEGREE),
+                    ParamGrid Cgrid = getDefaultGrid(C),
+                    ParamGrid gammaGrid  = getDefaultGrid(GAMMA),
+                    ParamGrid pGrid      = getDefaultGrid(P),
+                    ParamGrid nuGrid     = getDefaultGrid(NU),
+                    ParamGrid coeffGrid  = getDefaultGrid(COEF),
+                    ParamGrid degreeGrid = getDefaultGrid(DEGREE),
                     bool balanced=false) = 0;
 
+    /** @brief Trains an %SVM with optimal parameters
+
+    @param samples training samples
+    @param layout See ml::SampleTypes.
+    @param responses vector of responses associated with the training samples.
+    @param kFold Cross-validation parameter. The training set is divided into kFold subsets. One
+        subset is used to test the model, the others form the train set. So, the %SVM algorithm is executed kFold times.
+    @param Cgrid grid for C
+    @param gammaGrid grid for gamma
+    @param pGrid grid for p
+    @param nuGrid grid for nu
+    @param coeffGrid grid for coeff
+    @param degreeGrid grid for degree
+    @param balanced If true and the problem is 2-class classification then the method creates more
+        balanced cross-validation subsets, that is, the proportions between classes in each subset
+        are close to the proportions in the whole training dataset.
+
+    The method trains the %SVM model automatically by choosing the optimal parameters C, gamma, p,
+    nu, coef0, degree. Parameters are considered optimal when the cross-validation
+    estimate of the test set error is minimal.
+
+    Each grid argument defaults to SVM::getDefaultGridPtr for its parameter; pass a custom grid
+    (for example one built with ParamGrid::create) to override that default.
+
+    This function works for the classification (SVM::C_SVC or SVM::NU_SVC) as well as for the
+    regression (SVM::EPS_SVR or SVM::NU_SVR). If it is SVM::ONE_CLASS, no optimization is made and
+    the usual %SVM with parameters specified in params is executed.
+    */
+    CV_WRAP bool trainAuto(InputArray samples,
+            int layout,
+            InputArray responses,
+            int kFold = 10,
+            Ptr<ParamGrid> Cgrid = SVM::getDefaultGridPtr(SVM::C),
+            Ptr<ParamGrid> gammaGrid  = SVM::getDefaultGridPtr(SVM::GAMMA),
+            Ptr<ParamGrid> pGrid      = SVM::getDefaultGridPtr(SVM::P),
+            Ptr<ParamGrid> nuGrid     = SVM::getDefaultGridPtr(SVM::NU),
+            Ptr<ParamGrid> coeffGrid  = SVM::getDefaultGridPtr(SVM::COEF),
+            Ptr<ParamGrid> degreeGrid = SVM::getDefaultGridPtr(SVM::DEGREE),
+            bool balanced=false);
+
     /** @brief Retrieves all the support vectors
 
     The method returns all the support vectors as a floating-point matrix, where support vectors are
@@ -733,6 +781,16 @@ public:
      */
     static ParamGrid getDefaultGrid( int param_id );
 
+    /** @brief Generates a grid for %SVM parameters.
+
+    @param param_id %SVM parameter ID that must be one of the SVM::ParamTypes. The grid is
+    generated for the parameter with this ID.
+
+    The function generates a grid pointer for the specified parameter of the %SVM algorithm.
+    The grid may be passed to the function SVM::trainAuto.
+     */
+    CV_WRAP static Ptr<ParamGrid> getDefaultGridPtr( int param_id );
+
     /** Creates empty model.
     Use StatModel::train to train the model. Since %SVM has several parameters, you may want to
     find the best parameters for your problem, it can be done with SVM::trainAuto. */
diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp
index 3966906823..819d409cd0 100644
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@@ -50,6 +50,10 @@ ParamGrid::ParamGrid(double _minVal, double _maxVal, double _logStep)
     logStep = std::max(_logStep, 1.);
 }
 
+Ptr<ParamGrid> ParamGrid::create(double minval, double maxval, double logstep) {
+  return makePtr<ParamGrid>(minval, maxval, logstep);
+}
+
 bool StatModel::empty() const { return !isTrained(); }
 
 int StatModel::getVarCount() const { return 0; }
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 5e5b89163e..d486d26655 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -362,6 +362,12 @@ static void sortSamplesByClasses( const Mat& _samples, const Mat& _responses,
 
 //////////////////////// SVM implementation //////////////////////////////
 
+Ptr<ParamGrid> SVM::getDefaultGridPtr( int param_id)
+{
+  ParamGrid grid = getDefaultGrid(param_id); // this is not a nice solution..
+  return makePtr<ParamGrid>(grid.minVal, grid.maxVal, grid.logStep);
+}
+
 ParamGrid SVM::getDefaultGrid( int param_id )
 {
     ParamGrid grid;
@@ -1920,6 +1926,24 @@ public:
         bool returnDFVal;
     };
 
+    bool trainAuto_(InputArray samples, int layout,
+            InputArray responses, int kfold, Ptr<ParamGrid> Cgrid,
+            Ptr<ParamGrid> gammaGrid, Ptr<ParamGrid> pGrid, Ptr<ParamGrid> nuGrid,
+            Ptr<ParamGrid> coeffGrid, Ptr<ParamGrid> degreeGrid, bool balanced)
+    {
+        Ptr<TrainData> data = TrainData::create(samples, layout, responses);
+        return this->trainAuto(
+                data, kfold,
+                *Cgrid.get(),
+                *gammaGrid.get(),
+                *pGrid.get(),
+                *nuGrid.get(),
+                *coeffGrid.get(),
+                *degreeGrid.get(),
+                balanced);
+    }
+
+
     float predict( InputArray _samples, OutputArray _results, int flags ) const
     {
         float result = 0;
@@ -2281,6 +2305,19 @@ Mat SVM::getUncompressedSupportVectors() const
     return this_->getUncompressedSupportVectors_();
 }
 
+bool SVM::trainAuto(InputArray samples, int layout,
+            InputArray responses, int kfold, Ptr<ParamGrid> Cgrid,
+            Ptr<ParamGrid> gammaGrid, Ptr<ParamGrid> pGrid, Ptr<ParamGrid> nuGrid,
+            Ptr<ParamGrid> coeffGrid, Ptr<ParamGrid> degreeGrid, bool balanced)
+{
+  SVMImpl* this_ = dynamic_cast<SVMImpl*>(this);
+  if (!this_) {
+    CV_Error(Error::StsNotImplemented, "the class is not SVMImpl");
+  }
+  return this_->trainAuto_(samples, layout, responses,
+    kfold, Cgrid, gammaGrid, pGrid, nuGrid, coeffGrid, degreeGrid, balanced);
+}
+
 }
 }
 
diff --git a/modules/objdetect/CMakeLists.txt b/modules/objdetect/CMakeLists.txt
index d42e9e8f11..862d564cc7 100644
--- a/modules/objdetect/CMakeLists.txt
+++ b/modules/objdetect/CMakeLists.txt
@@ -1,2 +1,2 @@
 set(the_description "Object Detection")
-ocv_define_module(objdetect opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_highgui WRAP java python)
+ocv_define_module(objdetect opencv_core opencv_imgproc WRAP java python)
diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp
index 51843fa488..bb37ee91e0 100644
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@@ -824,10 +824,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                                CvPoint pt, double& stage_sum, int start_stage )
 {
 #ifdef CV_HAAR_USE_AVX
-    bool haveAVX = false;
-    if(cv::checkHardwareSupport(CV_CPU_AVX))
-    if(__xgetbv()&0x6)// Check if the OS will save the YMM registers
-       haveAVX = true;
+    bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX);
 #else
 #  ifdef CV_HAAR_USE_SSE
     bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
diff --git a/modules/objdetect/src/precomp.hpp b/modules/objdetect/src/precomp.hpp
index 97b976baf2..448d8b19b7 100644
--- a/modules/objdetect/src/precomp.hpp
+++ b/modules/objdetect/src/precomp.hpp
@@ -46,16 +46,8 @@
 #include "opencv2/objdetect.hpp"
 #include "opencv2/imgproc.hpp"
 
-#include "opencv2/ml.hpp"
-
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/ocl.hpp"
-
-#include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_HIGHGUI
-#  include "opencv2/highgui.hpp"
-#endif
-
 #include "opencv2/core/private.hpp"
 
 #ifdef HAVE_TEGRA_OPTIMIZATION
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index cfaf01efc7..9c28c7070f 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -798,6 +798,21 @@ PyObject* pyopencv_from(const Size& sz)
     return Py_BuildValue("(ii)", sz.width, sz.height);
 }
 
+template<>
+bool pyopencv_to(PyObject* obj, Size_<float>& sz, const char* name)
+{
+    (void)name;
+    if(!obj || obj == Py_None)
+        return true;
+    return PyArg_ParseTuple(obj, "ff", &sz.width, &sz.height) > 0;
+}
+
+template<>
+PyObject* pyopencv_from(const Size_<float>& sz)
+{
+    return Py_BuildValue("(ff)", sz.width, sz.height);
+}
+
 template<>
 bool pyopencv_to(PyObject* obj, Rect& r, const char* name)
 {
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 0e4f39a99d..2802ab8be7 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -8,6 +8,6 @@ set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
 if(BUILD_SHARED_LIBS AND BUILD_opencv_world AND OPENCV_WORLD_EXCLUDE_EXTRA_MODULES)
   set(STITCHING_CONTRIB_DEPS "")
 endif()
-ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect
+ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d
                   OPTIONAL opencv_cudaarithm opencv_cudafilters opencv_cudafeatures2d opencv_cudalegacy ${STITCHING_CONTRIB_DEPS}
                   WRAP python)
diff --git a/modules/stitching/perf/perf_stich.cpp b/modules/stitching/perf/perf_stich.cpp
index ded2571585..ee78d6d2f4 100644
--- a/modules/stitching/perf/perf_stich.cpp
+++ b/modules/stitching/perf/perf_stich.cpp
@@ -2,6 +2,8 @@
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/opencv_modules.hpp"
 
+#include "opencv2/core/ocl.hpp"
+
 using namespace std;
 using namespace cv;
 using namespace perf;
@@ -161,6 +163,9 @@ PERF_TEST_P(stitchDatasets, affine, testing::Combine(AFFINE_DATASETS, TEST_DETEC
         Ptr<Stitcher> stitcher = Stitcher::create(Stitcher::SCANS, false);
         stitcher->setFeaturesFinder(featuresFinder);
 
+        if (cv::ocl::useOpenCL())
+            cv::theRNG() = cv::RNG(12345); // prevent fails of Windows OpenCL builds (see #8294)
+
         startTimer();
         stitcher->stitch(imgs, pano);
         stopTimer();
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index edd6b61931..a02bd8b593 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -70,9 +70,12 @@ struct MatchPairsBody : ParallelLoopBody
 
     void operator ()(const Range &r) const
     {
+        cv::RNG rng = cv::theRNG(); // save entry rng state
         const int num_images = static_cast<int>(features.size());
         for (int i = r.start; i < r.end; ++i)
         {
+            cv::theRNG() = cv::RNG(rng.state + i); // force "stable" RNG seed for each processed pair
+
             int from = near_pairs[i].first;
             int to = near_pairs[i].second;
             int pair_idx = from*num_images + to;
diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp
index f7b9172a33..591b5fe883 100644
--- a/modules/stitching/src/stitcher.cpp
+++ b/modules/stitching/src/stitcher.cpp
@@ -261,6 +261,8 @@ Stitcher::Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArra
     double compose_scale = 1;
     bool is_compose_scale_set = false;
 
+    std::vector<detail::CameraParams> cameras_scaled(cameras_);
+
     UMat full_img, img;
     for (size_t img_idx = 0; img_idx < imgs_.size(); ++img_idx)
     {
@@ -282,16 +284,16 @@ Stitcher::Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArra
             compose_work_aspect = compose_scale / work_scale_;
 
             // Update warped image scale
-            warped_image_scale_ *= static_cast<float>(compose_work_aspect);
-            w = warper_->create((float)warped_image_scale_);
+            float warp_scale = static_cast<float>(warped_image_scale_ * compose_work_aspect);
+            w = warper_->create(warp_scale);
 
             // Update corners and sizes
             for (size_t i = 0; i < imgs_.size(); ++i)
             {
                 // Update intrinsics
-                cameras_[i].focal *= compose_work_aspect;
-                cameras_[i].ppx *= compose_work_aspect;
-                cameras_[i].ppy *= compose_work_aspect;
+                cameras_scaled[i].ppx *= compose_work_aspect;
+                cameras_scaled[i].ppy *= compose_work_aspect;
+                cameras_scaled[i].focal *= compose_work_aspect;
 
                 // Update corner and size
                 Size sz = full_img_sizes_[i];
@@ -302,8 +304,8 @@ Stitcher::Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArra
                 }
 
                 Mat K;
-                cameras_[i].K().convertTo(K, CV_32F);
-                Rect roi = w->warpRoi(sz, K, cameras_[i].R);
+                cameras_scaled[i].K().convertTo(K, CV_32F);
+                Rect roi = w->warpRoi(sz, K, cameras_scaled[i].R);
                 corners[i] = roi.tl();
                 sizes[i] = roi.size();
             }
@@ -324,7 +326,7 @@ Stitcher::Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArra
         LOGLN(" after resize time: " << ((getTickCount() - compositing_t) / getTickFrequency()) << " sec");
 
         Mat K;
-        cameras_[img_idx].K().convertTo(K, CV_32F);
+        cameras_scaled[img_idx].K().convertTo(K, CV_32F);
 
 #if ENABLE_LOG
         int64 pt = getTickCount();
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 7c3fc14a17..34c9297a1b 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -3,7 +3,6 @@ if(IOS OR WINRT)
 endif()
 
 set(the_description "Super Resolution")
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef -Wshadow)
 ocv_define_module(superres opencv_imgproc opencv_video
                   OPTIONAL opencv_videoio opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec
                   WRAP python)
diff --git a/modules/ts/src/ocl_test.cpp b/modules/ts/src/ocl_test.cpp
index abc8d95559..9f0acdec82 100644
--- a/modules/ts/src/ocl_test.cpp
+++ b/modules/ts/src/ocl_test.cpp
@@ -156,6 +156,9 @@ void dumpOpenCLDevice()
         DUMP_MESSAGE_STDOUT("    Version = " << device.version());
         DUMP_PROPERTY_XML("cv_ocl_current_deviceVersion", device.version());
 
+        DUMP_MESSAGE_STDOUT("    Driver version = " << device.driverVersion());
+        DUMP_PROPERTY_XML("cv_ocl_current_driverVersion", device.driverVersion());
+
         DUMP_MESSAGE_STDOUT("    Compute units = "<< device.maxComputeUnits());
         DUMP_PROPERTY_XML("cv_ocl_current_maxComputeUnits", device.maxComputeUnits());
 
diff --git a/modules/videoio/src/cap_avfoundation_mac.mm b/modules/videoio/src/cap_avfoundation_mac.mm
index ce6e3d074c..225e2fad62 100644
--- a/modules/videoio/src/cap_avfoundation_mac.mm
+++ b/modules/videoio/src/cap_avfoundation_mac.mm
@@ -293,7 +293,7 @@ void CvCaptureCAM::stopCaptureDevice() {
 
     [mCaptureSession release];
     [mCaptureDeviceInput release];
-    [mCaptureDevice release];
+    // [mCaptureDevice release]; fix #7833
 
     [mCaptureVideoDataOutput release];
     [mCapture release];
diff --git a/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp b/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
index fe321ec21d..71097a852d 100644
--- a/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
+++ b/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
@@ -13,15 +13,25 @@ using namespace cv;
 void detectAndDisplay( Mat frame );
 
 /** Global variables */
-String face_cascade_name = "haarcascade_frontalface_alt.xml";
-String eyes_cascade_name = "haarcascade_eye_tree_eyeglasses.xml";
+String face_cascade_name, eyes_cascade_name;
 CascadeClassifier face_cascade;
 CascadeClassifier eyes_cascade;
 String window_name = "Capture - Face detection";
 
 /** @function main */
-int main( void )
+int main( int argc, const char** argv )
 {
+    CommandLineParser parser(argc, argv,
+        "{help h||}"
+        "{face_cascade|../../data/haarcascades/haarcascade_frontalface_alt.xml|}"
+        "{eyes_cascade|../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml|}");
+
+    cout << "\nThis program demonstrates using the cv::CascadeClassifier class to detect objects (Face + eyes) in a video stream.\n"
+            "You can use Haar or LBP features.\n\n";
+    parser.printMessage();
+
+    face_cascade_name = parser.get<string>("face_cascade");
+    eyes_cascade_name = parser.get<string>("eyes_cascade");
     VideoCapture capture;
     Mat frame;
 
@@ -30,7 +40,7 @@ int main( void )
     if( !eyes_cascade.load( eyes_cascade_name ) ){ printf("--(!)Error loading eyes cascade\n"); return -1; };
 
     //-- 2. Read the video stream
-    capture.open( -1 );
+    capture.open( 0 );
     if ( ! capture.isOpened() ) { printf("--(!)Error opening video capture\n"); return -1; }
 
     while ( capture.read(frame) )
diff --git a/samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp b/samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp
deleted file mode 100644
index e57139b18a..0000000000
--- a/samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * @file objectDetection2.cpp
- * @author A. Huaman ( based in the classic facedetect.cpp in samples/c )
- * @brief A simplified version of facedetect.cpp, show how to load a cascade classifier and how to find objects (Face + eyes) in a video stream - Using LBP here
- */
-#include "opencv2/objdetect.hpp"
-#include "opencv2/videoio.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-
-#include <iostream>
-#include <stdio.h>
-
-using namespace std;
-using namespace cv;
-
-/** Function Headers */
-void detectAndDisplay( Mat frame );
-
-/** Global variables */
-String face_cascade_name = "lbpcascade_frontalface.xml";
-String eyes_cascade_name = "haarcascade_eye_tree_eyeglasses.xml";
-CascadeClassifier face_cascade;
-CascadeClassifier eyes_cascade;
-String window_name = "Capture - Face detection";
-/**
- * @function main
- */
-int main( void )
-{
-    VideoCapture capture;
-    Mat frame;
-
-    //-- 1. Load the cascade
-    if( !face_cascade.load( face_cascade_name ) ){ printf("--(!)Error loading face cascade\n"); return -1; };
-    if( !eyes_cascade.load( eyes_cascade_name ) ){ printf("--(!)Error loading eyes cascade\n"); return -1; };
-
-    //-- 2. Read the video stream
-    capture.open( -1 );
-    if ( ! capture.isOpened() ) { printf("--(!)Error opening video capture\n"); return -1; }
-
-    while ( capture.read(frame) )
-    {
-        if( frame.empty() )
-        {
-            printf(" --(!) No captured frame -- Break!");
-            break;
-        }
-
-        //-- 3. Apply the classifier to the frame
-        detectAndDisplay( frame );
-
-        //-- bail out if escape was pressed
-        char c = (char)waitKey(10);
-        if( c == 27 ) { break; }
-    }
-    return 0;
-}
-
-/**
- * @function detectAndDisplay
- */
-void detectAndDisplay( Mat frame )
-{
-    std::vector<Rect> faces;
-    Mat frame_gray;
-
-    cvtColor( frame, frame_gray, COLOR_BGR2GRAY );
-    equalizeHist( frame_gray, frame_gray );
-
-    //-- Detect faces
-    face_cascade.detectMultiScale( frame_gray, faces, 1.1, 2, 0, Size(80, 80) );
-
-    for( size_t i = 0; i < faces.size(); i++ )
-    {
-        Mat faceROI = frame_gray( faces[i] );
-        std::vector<Rect> eyes;
-
-        //-- In each face, detect eyes
-        eyes_cascade.detectMultiScale( faceROI, eyes, 1.1, 2, 0 |CASCADE_SCALE_IMAGE, Size(30, 30) );
-        if( eyes.size() == 2)
-        {
-            //-- Draw the face
-            Point center( faces[i].x + faces[i].width/2, faces[i].y + faces[i].height/2 );
-            ellipse( frame, center, Size( faces[i].width/2, faces[i].height/2 ), 0, 0, 360, Scalar( 255, 0, 0 ), 2, 8, 0 );
-
-            for( size_t j = 0; j < eyes.size(); j++ )
-            { //-- Draw the eyes
-                Point eye_center( faces[i].x + eyes[j].x + eyes[j].width/2, faces[i].y + eyes[j].y + eyes[j].height/2 );
-                int radius = cvRound( (eyes[j].width + eyes[j].height)*0.25 );
-                circle( frame, eye_center, radius, Scalar( 255, 0, 255 ), 3, 8, 0 );
-            }
-        }
-
-    }
-    //-- Show what you got
-    imshow( window_name, frame );
-}