diff --git a/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h b/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h index 714869cc71..ca86ab40e9 100644 --- a/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h +++ b/3rdparty/ittnotify/src/ittnotify/ittnotify_config.h @@ -335,7 +335,7 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) #ifdef SDL_STRNCPY_S #define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l) #else -#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, l) +#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, b) #endif /* SDL_STRNCPY_S */ #define __itt_fstrdup(s) strdup(s) diff --git a/3rdparty/openexr/CMakeLists.txt b/3rdparty/openexr/CMakeLists.txt index 34ba6f7238..88e06e96cf 100644 --- a/3rdparty/openexr/CMakeLists.txt +++ b/3rdparty/openexr/CMakeLists.txt @@ -47,6 +47,10 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -W -Wsuggest-override -Winconsistent-missing-override -Wimplicit-fallthrough ) +if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0) + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess) +endif() + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4018 /wd4099 /wd4100 /wd4101 /wd4127 /wd4189 /wd4245 /wd4305 /wd4389 /wd4512 /wd4701 /wd4702 /wd4706 /wd4800) # vs2005 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4334) # vs2005 Win64 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244) # vs2008 diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt index af106a01af..ada9891a7b 100644 --- a/3rdparty/protobuf/CMakeLists.txt +++ b/3rdparty/protobuf/CMakeLists.txt @@ -29,6 +29,9 @@ if(CV_ICC) -wd265 -wd858 -wd873 -wd2196 ) endif() +if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0) + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess) +endif() # Easier to support different versions of protobufs function(append_if_exist OUTPUT_LIST) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c9b98ef5d..2c66ae0950 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1403,7 +1403,17 @@ if(WITH_HALIDE OR HAVE_HALIDE) endif() if(WITH_INF_ENGINE OR HAVE_INF_ENGINE) - status(" Inference Engine:" HAVE_INF_ENGINE THEN "YES (${INF_ENGINE_LIBRARIES} ${INF_ENGINE_INCLUDE_DIRS})" ELSE NO) + if(HAVE_INF_ENGINE) + set(__msg "YES") + if(DEFINED INF_ENGINE_VERSION) + set(__msg "YES (ver ${INF_ENGINE_VERSION})") + endif() + status(" Inference Engine:" "${__msg}") + status(" libs:" "${INF_ENGINE_LIBRARIES}") + status(" includes:" "${INF_ENGINE_INCLUDE_DIRS}") + else() + status(" Inference Engine:" "NO") + endif() endif() if(WITH_EIGEN OR HAVE_EIGEN) diff --git a/apps/createsamples/utility.cpp b/apps/createsamples/utility.cpp index bae9a9acac..0ec7e8cb6e 100644 --- a/apps/createsamples/utility.cpp +++ b/apps/createsamples/utility.cpp @@ -54,6 +54,10 @@ #include "opencv2/highgui.hpp" #include "opencv2/calib3d.hpp" +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + using namespace cv; #ifndef PATH_MAX diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index 05f05585e0..3bdb6fa961 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -8,14 +8,23 @@ if(NOT APPLE AND CV_CLANG) return() endif() -set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) -if(ANDROID) - set(CUDA_TARGET_OS_VARIANT "Android") -endif() -find_host_package(CUDA "${MIN_VER_CUDA}" QUIET) +if(((NOT CMAKE_VERSION VERSION_LESS "3.9.0") # requires https://gitlab.kitware.com/cmake/cmake/merge_requests/663 + OR 
OPENCV_CUDA_FORCE_EXTERNAL_CMAKE_MODULE) + AND NOT OPENCV_CUDA_FORCE_BUILTIN_CMAKE_MODULE) + ocv_update(CUDA_LINK_LIBRARIES_KEYWORD "LINK_PRIVATE") + find_host_package(CUDA "${MIN_VER_CUDA}" QUIET) +else() + # Use OpenCV's patched "FindCUDA" module + set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) + + if(ANDROID) + set(CUDA_TARGET_OS_VARIANT "Android") + endif() + find_host_package(CUDA "${MIN_VER_CUDA}" QUIET) -list(REMOVE_AT CMAKE_MODULE_PATH 0) + list(REMOVE_AT CMAKE_MODULE_PATH 0) +endif() if(CUDA_FOUND) set(HAVE_CUDA 1) diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake index ae766a863a..7c75e97e67 100644 --- a/cmake/OpenCVDetectInferenceEngine.cmake +++ b/cmake/OpenCVDetectInferenceEngine.cmake @@ -16,22 +16,32 @@ macro(ie_fail) endmacro() +find_package(InferenceEngine QUIET) +if(InferenceEngine_FOUND) + set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}") + set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}") + set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}") + set(HAVE_INF_ENGINE TRUE) + return() +endif() + +ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH) + if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp") set(ie_root_paths "${INF_ENGINE_ROOT_DIR}") - if(DEFINED ENV{INTEL_CVSDK_DIR}) - list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}") - list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}/inference_engine") - endif() if(DEFINED INTEL_CVSDK_DIR) - list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}") - list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/inference_engine") + list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/") + list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine") endif() if(NOT ie_root_paths) - list(APPEND ie_root_paths "/opt/intel/deeplearning_deploymenttoolkit/deployment_tools/inference_engine") + list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/") endif() find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths}) + if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$") + unset(INF_ENGINE_ROOT_DIR CACHE) + endif() endif() set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory") @@ -40,6 +50,7 @@ if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}" OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp" ) + message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.") ie_fail() endif() @@ -47,19 +58,19 @@ set(INF_ENGINE_LIBRARIES "") set(ie_lib_list inference_engine) +if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}") + set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}") +endif() + link_directories( - ${INTEL_CVSDK_DIR}/inference_engine/external/mkltiny_lnx/lib - ${INTEL_CVSDK_DIR}/inference_engine/external/cldnn/lib + ${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib + ${INF_ENGINE_ROOT_DIR}/external/cldnn/lib ) foreach(lib ${ie_lib_list}) - find_library(${lib} - NAMES ${lib} - # For inference_engine - HINTS ${IE_PLUGINS_PATH} - HINTS "$ENV{IE_PLUGINS_PATH}" - ) + find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH}) if(NOT ${lib}) + message(WARNING "DL IE: Can't find library: '${lib}'") ie_fail() endif() list(APPEND INF_ENGINE_LIBRARIES ${${lib}}) diff --git a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown index 
a035199ee9..48a4e34fb5 100644 --- a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown +++ b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown @@ -53,48 +53,143 @@ Theory Code ---- +@add_toggle_cpp +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp) + - The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ : -@include BasicLinearTransforms.cpp + @include samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp +@end_toggle + +@add_toggle_java +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java) + +- The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ : + @include samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java +@end_toggle + +@add_toggle_python +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py) + +- The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ : + @include samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py +@end_toggle Explanation ----------- --# We begin by creating parameters to save \f$\alpha\f$ and \f$\beta\f$ to be entered by the user: - @snippet BasicLinearTransforms.cpp basic-linear-transform-parameters +- We load an image using @ref cv::imread and save it in a Mat object: + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-load +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-load +@end_toggle --# We load an image using @ref cv::imread and save it in a Mat object: - @snippet BasicLinearTransforms.cpp basic-linear-transform-load --# Now, since we will make some transformations to this image, we need a new Mat object to store +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-load +@end_toggle + +- Now, since we will make some transformations to this image, we need a new Mat object to store it. 
Also, we want this to have the following features: - Initial pixel values equal to zero - Same size and type as the original image - @snippet BasicLinearTransforms.cpp basic-linear-transform-output - We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer based on - *image.size()* and *image.type()* --# Now, to perform the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ we will access to each +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-output +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-output +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-output +@end_toggle + +We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer based on +*image.size()* and *image.type()* + +- We ask now the values of \f$\alpha\f$ and \f$\beta\f$ to be entered by the user: + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-parameters +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-parameters +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-parameters +@end_toggle + +- Now, to perform the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ we will access to each pixel in image. Since we are operating with BGR images, we will have three values per pixel (B, G and R), so we will also access them separately. Here is the piece of code: - @snippet BasicLinearTransforms.cpp basic-linear-transform-operation - Notice the following: - - To access each pixel in the images we are using this syntax: *image.at\(y,x)[c]* - where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2). - - Since the operation \f$\alpha \cdot p(i,j) + \beta\f$ can give values out of range or not - integers (if \f$\alpha\f$ is float), we use cv::saturate_cast to make sure the - values are valid. --# Finally, we create windows and show the images, the usual way. - @snippet BasicLinearTransforms.cpp basic-linear-transform-display +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-operation +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-operation +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-operation +@end_toggle + +Notice the following (**C++ code only**): +- To access each pixel in the images we are using this syntax: *image.at\(y,x)[c]* + where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2). +- Since the operation \f$\alpha \cdot p(i,j) + \beta\f$ can give values out of range or not + integers (if \f$\alpha\f$ is float), we use cv::saturate_cast to make sure the + values are valid. + +- Finally, we create windows and show the images, the usual way. 
+ +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-display +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-display +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-display +@end_toggle @note Instead of using the **for** loops to access each pixel, we could have simply used this command: - @code{.cpp} - image.convertTo(new_image, -1, alpha, beta); - @endcode - where @ref cv::Mat::convertTo would effectively perform *new_image = a*image + beta\*. However, we - wanted to show you how to access each pixel. In any case, both methods give the same result but - convertTo is more optimized and works a lot faster. + +@add_toggle_cpp +@code{.cpp} +image.convertTo(new_image, -1, alpha, beta); +@endcode +@end_toggle + +@add_toggle_java +@code{.java} +image.convertTo(newImage, -1, alpha, beta); +@endcode +@end_toggle + +@add_toggle_python +@code{.py} +new_image = cv.convertScaleAbs(image, alpha=alpha, beta=beta) +@endcode +@end_toggle + +where @ref cv::Mat::convertTo would effectively perform *new_image = a*image + beta\*. However, we +wanted to show you how to access each pixel. In any case, both methods give the same result but +convertTo is more optimized and works a lot faster. Result ------ @@ -185,10 +280,31 @@ and are not intended to be used as a replacement of a raster graphics editor!** ### Code +@add_toggle_cpp Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp). +@end_toggle + +@add_toggle_java +Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java). +@end_toggle + +@add_toggle_python +Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py). +@end_toggle + Code for the gamma correction: -@snippet changing_contrast_brightness_image.cpp changing-contrast-brightness-gamma-correction +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp changing-contrast-brightness-gamma-correction +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java changing-contrast-brightness-gamma-correction +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py changing-contrast-brightness-gamma-correction +@end_toggle A look-up table is used to improve the performance of the computation as only 256 values needs to be calculated once. 
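Aside (illustrative, not part of the patch): the basic_linear_transform tutorial above revolves around the linear transform g(i,j) = alpha*f(i,j) + beta and around gamma correction through a 256-entry look-up table. A minimal standalone C++ sketch of both operations is given below; the input path and the alpha, beta and gamma values are assumptions made for the example, not values taken from the samples.

```cpp
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat image = cv::imread("input.jpg");  // hypothetical input file
    if (image.empty()) return 1;

    double alpha = 1.5;  // gain (contrast), illustrative value
    int    beta  = 30;   // bias (brightness), illustrative value

    // g(i,j) = alpha * f(i,j) + beta; convertTo applies the saturating cast internally,
    // equivalent to the per-pixel loop with cv::saturate_cast shown in the tutorial
    cv::Mat new_image;
    image.convertTo(new_image, -1, alpha, beta);

    // Gamma correction via a look-up table: only 256 values are computed once,
    // then cv::LUT remaps every pixel of the image
    double gamma = 0.5;  // illustrative value
    cv::Mat lut(1, 256, CV_8U);
    uchar* p = lut.ptr();
    for (int i = 0; i < 256; ++i)
        p[i] = cv::saturate_cast<uchar>(std::pow(i / 255.0, gamma) * 255.0);

    cv::Mat gamma_corrected;
    cv::LUT(image, lut, gamma_corrected);
    return 0;
}
```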
diff --git a/doc/tutorials/core/mat_operations.markdown b/doc/tutorials/core/mat_operations.markdown index 136ce1d777..c2e7d1ca6a 100644 --- a/doc/tutorials/core/mat_operations.markdown +++ b/doc/tutorials/core/mat_operations.markdown @@ -7,25 +7,50 @@ Input/Output ### Images Load an image from a file: -@code{.cpp} - Mat img = imread(filename) -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Load an image from a file +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Load an image from a file +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Load an image from a file +@end_toggle If you read a jpg file, a 3 channel image is created by default. If you need a grayscale image, use: -@code{.cpp} - Mat img = imread(filename, IMREAD_GRAYSCALE); -@endcode +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Load an image from a file in grayscale +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Load an image from a file in grayscale +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Load an image from a file in grayscale +@end_toggle + +@note Format of the file is determined by its content (first few bytes). To save an image to a file: + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Save image +@end_toggle -@note format of the file is determined by its content (first few bytes) Save an image to a file: +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Save image +@end_toggle -@code{.cpp} - imwrite(filename, img); -@endcode +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Save image +@end_toggle -@note format of the file is determined by its extension. +@note Format of the file is determined by its extension. -@note use imdecode and imencode to read and write image from/to memory rather than a file. +@note Use cv::imdecode and cv::imencode to read and write an image from/to memory rather than a file. Basic operations with images ---------------------------- @@ -35,49 +60,65 @@ Basic operations with images In order to get pixel intensity value, you have to know the type of an image and the number of channels. Here is an example for a single channel grey scale image (type 8UC1) and pixel coordinates x and y: -@code{.cpp} - Scalar intensity = img.at(y, x); -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 1 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Pixel access 1 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 1 +@end_toggle + +C++ version only: intensity.val[0] contains a value from 0 to 255. Note the ordering of x and y. Since in OpenCV images are represented by the same structure as matrices, we use the same convention for both cases - the 0-based row index (or y-coordinate) goes first and the 0-based column index (or -x-coordinate) follows it. Alternatively, you can use the following notation: -@code{.cpp} - Scalar intensity = img.at(Point(x, y)); -@endcode +x-coordinate) follows it. 
Alternatively, you can use the following notation (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 2 + Now let us consider a 3 channel image with BGR color ordering (the default format returned by imread): -@code{.cpp} - Vec3b intensity = img.at(y, x); - uchar blue = intensity.val[0]; - uchar green = intensity.val[1]; - uchar red = intensity.val[2]; -@endcode + +**C++ code** +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 3 + +**Python Python** +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 3 + You can use the same method for floating-point images (for example, you can get such an image by -running Sobel on a 3 channel image): -@code{.cpp} - Vec3f intensity = img.at(y, x); - float blue = intensity.val[0]; - float green = intensity.val[1]; - float red = intensity.val[2]; -@endcode +running Sobel on a 3 channel image) (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 4 + The same method can be used to change pixel intensities: -@code{.cpp} - img.at(y, x) = 128; -@endcode -There are functions in OpenCV, especially from calib3d module, such as projectPoints, that take an + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 5 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Pixel access 5 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 5 +@end_toggle + +There are functions in OpenCV, especially from calib3d module, such as cv::projectPoints, that take an array of 2D or 3D points in the form of Mat. Matrix should contain exactly one column, each row corresponds to a point, matrix type should be 32FC2 or 32FC3 correspondingly. Such a matrix can be -easily constructed from `std::vector`: -@code{.cpp} - vector points; - //... fill the array - Mat pointsMat = Mat(points); -@endcode -One can access a point in this matrix using the same method Mat::at : -@code{.cpp} -Point2f point = pointsMat.at(i, 0); -@endcode +easily constructed from `std::vector` (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Mat from points vector + +One can access a point in this matrix using the same method `Mat::at` (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Point access ### Memory management and reference counting @@ -85,91 +126,141 @@ Mat is a structure that keeps matrix/image characteristics (rows and columns num and a pointer to data. So nothing prevents us from having several instances of Mat corresponding to the same data. A Mat keeps a reference count that tells if data has to be deallocated when a particular instance of Mat is destroyed. Here is an example of creating two matrices without copying -data: -@code{.cpp} - std::vector points; - // .. fill the array - Mat pointsMat = Mat(points).reshape(1); -@endcode -As a result we get a 32FC1 matrix with 3 columns instead of 32FC3 matrix with 1 column. pointsMat +data (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 1 + +As a result, we get a 32FC1 matrix with 3 columns instead of 32FC3 matrix with 1 column. `pointsMat` uses data from points and will not deallocate the memory when destroyed. 
In this particular -instance, however, developer has to make sure that lifetime of points is longer than of pointsMat. +instance, however, developer has to make sure that lifetime of `points` is longer than of `pointsMat` If we need to copy the data, this is done using, for example, cv::Mat::copyTo or cv::Mat::clone: -@code{.cpp} - Mat img = imread("image.jpg"); - Mat img1 = img.clone(); -@endcode -To the contrary with C API where an output image had to be created by developer, an empty output Mat + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 2 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Reference counting 2 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Reference counting 2 +@end_toggle + +To the contrary with C API where an output image had to be created by the developer, an empty output Mat can be supplied to each function. Each implementation calls Mat::create for a destination matrix. This method allocates data for a matrix if it is empty. If it is not empty and has the correct size -and type, the method does nothing. If, however, size or type are different from input arguments, the +and type, the method does nothing. If however, size or type are different from the input arguments, the data is deallocated (and lost) and a new data is allocated. For example: -@code{.cpp} - Mat img = imread("image.jpg"); - Mat sobelx; - Sobel(img, sobelx, CV_32F, 1, 0); -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 3 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Reference counting 3 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Reference counting 3 +@end_toggle ### Primitive operations There is a number of convenient operators defined on a matrix. For example, here is how we can make -a black image from an existing greyscale image \`img\`: -@code{.cpp} - img = Scalar(0); -@endcode +a black image from an existing greyscale image `img` + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Set image to black +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Set image to black +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Set image to black +@end_toggle + Selecting a region of interest: -@code{.cpp} - Rect r(10, 10, 100, 100); - Mat smallImg = img(r); -@endcode -A conversion from Mat to C API data structures: -@code{.cpp} - Mat img = imread("image.jpg"); - IplImage img1 = img; - CvMat m = img; -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Select ROI +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Select ROI +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Select ROI +@end_toggle + +A conversion from Mat to C API data structures (**C++ only**): + +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp C-API conversion Note that there is no data copying here. 
-Conversion from color to grey scale: -@code{.cpp} - Mat img = imread("image.jpg"); // loading a 8UC3 image - Mat grey; - cvtColor(img, grey, COLOR_BGR2GRAY); -@endcode +Conversion from color to greyscale: + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp BGR to Gray +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java BGR to Gray +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py BGR to Gray +@end_toggle + Change image type from 8UC1 to 32FC1: -@code{.cpp} - src.convertTo(dst, CV_32F); -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Convert to CV_32F +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Convert to CV_32F +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Convert to CV_32F +@end_toggle ### Visualizing images It is very useful to see intermediate results of your algorithm during development process. OpenCV provides a convenient way of visualizing images. A 8U image can be shown using: -@code{.cpp} - Mat img = imread("image.jpg"); - namedWindow("image", WINDOW_AUTOSIZE); - imshow("image", img); - waitKey(); -@endcode +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp imshow 1 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java imshow 1 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py imshow 1 +@end_toggle A call to waitKey() starts a message passing cycle that waits for a key stroke in the "image" window. A 32F image needs to be converted to 8U type. For example: -@code{.cpp} - Mat img = imread("image.jpg"); - Mat grey; - cvtColor(img, grey, COLOR_BGR2GRAY); - - Mat sobelx; - Sobel(grey, sobelx, CV_32F, 1, 0); - - double minVal, maxVal; - minMaxLoc(sobelx, &minVal, &maxVal); //find minimum and maximum intensities - Mat draw; - sobelx.convertTo(draw, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal)); - - namedWindow("image", WINDOW_AUTOSIZE); - imshow("image", draw); - waitKey(); -@endcode + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp imshow 2 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java imshow 2 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py imshow 2 +@end_toggle + +@note Here cv::namedWindow is not necessary since it is immediately followed by cv::imshow. +Nevertheless, it can be used to change the window properties or when using cv::createTrackbar diff --git a/doc/tutorials/core/table_of_content_core.markdown b/doc/tutorials/core/table_of_content_core.markdown index 0a1fb5614c..8db254944a 100644 --- a/doc/tutorials/core/table_of_content_core.markdown +++ b/doc/tutorials/core/table_of_content_core.markdown @@ -36,6 +36,10 @@ understanding how to manipulate the images on a pixel level. - @subpage tutorial_mat_operations + *Languages:* C++, Java, Python + + *Compatibility:* \> OpenCV 2.0 + Reading/writing images from file, accessing pixels, primitive operations, visualizing images. - @subpage tutorial_adding_images @@ -50,6 +54,8 @@ understanding how to manipulate the images on a pixel level. 
- @subpage tutorial_basic_linear_transform + *Languages:* C++, Java, Python + *Compatibility:* \> OpenCV 2.0 *Author:* Ana Huamán diff --git a/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown b/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown index d1fe50b8dd..63ab1eeb1a 100644 --- a/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown +++ b/doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown @@ -91,37 +91,112 @@ __Find the eigenvectors and eigenvalues of the covariance matrix__ Source Code ----------- -This tutorial code's is shown lines below. You can also download it from - [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp). -@include cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp +@add_toggle_cpp +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp) + +- **Code at glance:** + @include samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp +@end_toggle + +@add_toggle_java +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java) + +- **Code at glance:** + @include samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java +@end_toggle + +@add_toggle_python +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py) + +- **Code at glance:** + @include samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py +@end_toggle @note Another example using PCA for dimensionality reduction while maintaining an amount of variance can be found at [opencv_source_code/samples/cpp/pca.cpp](https://github.com/opencv/opencv/tree/master/samples/cpp/pca.cpp) Explanation ----------- --# __Read image and convert it to binary__ +- __Read image and convert it to binary__ + +Here we apply the necessary pre-processing procedures in order to be able to detect the objects of interest. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pre-process +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java pre-process +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py pre-process +@end_toggle + +- __Extract objects of interest__ + +Then find and filter contours by size and obtain the orientation of the remaining ones. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp contours +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java contours +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py contours +@end_toggle + +- __Extract orientation__ + +Orientation is extracted by the call of getOrientation() function, which performs all the PCA procedure. 
+ +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pca +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java pca +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py pca +@end_toggle + +First the data need to be arranged in a matrix with size n x 2, where n is the number of data points we have. Then we can perform that PCA analysis. The calculated mean (i.e. center of mass) is stored in the _cntr_ variable and the eigenvectors and eigenvalues are stored in the corresponding std::vector’s. - Here we apply the necessary pre-processing procedures in order to be able to detect the objects of interest. - @snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pre-process +- __Visualize result__ --# __Extract objects of interest__ +The final result is visualized through the drawAxis() function, where the principal components are drawn in lines, and each eigenvector is multiplied by its eigenvalue and translated to the mean position. - Then find and filter contours by size and obtain the orientation of the remaining ones. - @snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp contours +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization +@end_toggle --# __Extract orientation__ +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java visualization +@end_toggle - Orientation is extracted by the call of getOrientation() function, which performs all the PCA procedure. - @snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pca +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py visualization +@end_toggle - First the data need to be arranged in a matrix with size n x 2, where n is the number of data points we have. Then we can perform that PCA analysis. The calculated mean (i.e. center of mass) is stored in the _cntr_ variable and the eigenvectors and eigenvalues are stored in the corresponding std::vector’s. +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization1 +@end_toggle --# __Visualize result__ +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java visualization1 +@end_toggle - The final result is visualized through the drawAxis() function, where the principal components are drawn in lines, and each eigenvector is multiplied by its eigenvalue and translated to the mean position. - @snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization - @snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization1 +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py visualization1 +@end_toggle Results ------- diff --git a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown b/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown index 2925da9942..f1a2261e37 100644 --- a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown +++ b/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown @@ -96,25 +96,67 @@ Source Code @note The following code has been implemented with OpenCV 3.0 classes and functions. 
An equivalent version of the code using OpenCV 2.4 can be found in [this page.](http://docs.opencv.org/2.4/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.html#introductiontosvms) -@include cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp +@add_toggle_cpp +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp) + +- **Code at glance:** + @include samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp +@end_toggle + +@add_toggle_java +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java) + +- **Code at glance:** + @include samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java +@end_toggle + +@add_toggle_python +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py) + +- **Code at glance:** + @include samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py +@end_toggle Explanation ----------- --# **Set up the training data** +- **Set up the training data** + +The training data of this exercise is formed by a set of labeled 2D-points that belong to one of +two different classes; one of the classes consists of one point and the other of three points. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup1 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup1 +@end_toggle - The training data of this exercise is formed by a set of labeled 2D-points that belong to one of - two different classes; one of the classes consists of one point and the other of three points. +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1 +@end_toggle - @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup1 +The function @ref cv::ml::SVM::train that will be used afterwards requires the training data to be +stored as @ref cv::Mat objects of floats. Therefore, we create these objects from the arrays +defined above: - The function @ref cv::ml::SVM::train that will be used afterwards requires the training data to be - stored as @ref cv::Mat objects of floats. Therefore, we create these objects from the arrays - defined above: +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup2 +@end_toggle - @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup2 +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup2 +@end_toggle --# **Set up SVM's parameters** +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1 +@end_toggle + +- **Set up SVM's parameters** In this tutorial we have introduced the theory of SVMs in the most simple case, when the training examples are spread into two classes that are linearly separable. However, SVMs can be @@ -123,35 +165,55 @@ Explanation we have to define some parameters before training the SVM. These parameters are stored in an object of the class @ref cv::ml::SVM. - @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp init - - Here: - - *Type of SVM*. 
We choose here the type @ref cv::ml::SVM::C_SVC "C_SVC" that can be used for - n-class classification (n \f$\geq\f$ 2). The important feature of this type is that it deals - with imperfect separation of classes (i.e. when the training data is non-linearly separable). - This feature is not important here since the data is linearly separable and we chose this SVM - type only for being the most commonly used. - - - *Type of SVM kernel*. We have not talked about kernel functions since they are not - interesting for the training data we are dealing with. Nevertheless, let's explain briefly now - the main idea behind a kernel function. It is a mapping done to the training data to improve - its resemblance to a linearly separable set of data. This mapping consists of increasing the - dimensionality of the data and is done efficiently using a kernel function. We choose here the - type @ref cv::ml::SVM::LINEAR "LINEAR" which means that no mapping is done. This parameter is - defined using cv::ml::SVM::setKernel. - - - *Termination criteria of the algorithm*. The SVM training procedure is implemented solving a - constrained quadratic optimization problem in an **iterative** fashion. Here we specify a - maximum number of iterations and a tolerance error so we allow the algorithm to finish in - less number of steps even if the optimal hyperplane has not been computed yet. This - parameter is defined in a structure @ref cv::TermCriteria . - --# **Train the SVM** +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp init +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java init +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py init +@end_toggle + +Here: +- *Type of SVM*. We choose here the type @ref cv::ml::SVM::C_SVC "C_SVC" that can be used for + n-class classification (n \f$\geq\f$ 2). The important feature of this type is that it deals + with imperfect separation of classes (i.e. when the training data is non-linearly separable). + This feature is not important here since the data is linearly separable and we chose this SVM + type only for being the most commonly used. + +- *Type of SVM kernel*. We have not talked about kernel functions since they are not + interesting for the training data we are dealing with. Nevertheless, let's explain briefly now + the main idea behind a kernel function. It is a mapping done to the training data to improve + its resemblance to a linearly separable set of data. This mapping consists of increasing the + dimensionality of the data and is done efficiently using a kernel function. We choose here the + type @ref cv::ml::SVM::LINEAR "LINEAR" which means that no mapping is done. This parameter is + defined using cv::ml::SVM::setKernel. + +- *Termination criteria of the algorithm*. The SVM training procedure is implemented solving a + constrained quadratic optimization problem in an **iterative** fashion. Here we specify a + maximum number of iterations and a tolerance error so we allow the algorithm to finish in + less number of steps even if the optimal hyperplane has not been computed yet. This + parameter is defined in a structure @ref cv::TermCriteria . + +- **Train the SVM** We call the method @ref cv::ml::SVM::train to build the SVM model. 
- @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp train +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp train +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java train +@end_toggle --# **Regions classified by the SVM** +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py train +@end_toggle + +- **Regions classified by the SVM** The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In this example we have used this method in order to color the space depending on the prediction done @@ -159,16 +221,36 @@ Explanation Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in green if it is the class with label 1 and in blue if it is the class with label -1. - @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show +@end_toggle --# **Support vectors** +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show +@end_toggle + +- **Support vectors** We use here a couple of methods to obtain information about the support vectors. The method @ref cv::ml::SVM::getSupportVectors obtain all of the support vectors. We have used this methods here to find the training examples that are support vectors and highlight them. - @snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show_vectors +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show_vectors +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show_vectors +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show_vectors +@end_toggle Results ------- diff --git a/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown b/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown index f98cd63639..e03c75b62f 100644 --- a/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown +++ b/doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown @@ -92,81 +92,175 @@ You may also find the source code in `samples/cpp/tutorial_code/ml/non_linear_sv @note The following code has been implemented with OpenCV 3.0 classes and functions. 
An equivalent version of the code using OpenCV 2.4 can be found in [this page.](http://docs.opencv.org/2.4/doc/tutorials/ml/non_linear_svms/non_linear_svms.html#nonlinearsvms) -@include cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp +@add_toggle_cpp +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp) + +- **Code at glance:** + @include samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp +@end_toggle + +@add_toggle_java +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java) + +- **Code at glance:** + @include samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java +@end_toggle + +@add_toggle_python +- **Downloadable code**: Click + [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py) + +- **Code at glance:** + @include samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py +@end_toggle Explanation ----------- --# __Set up the training data__ +- __Set up the training data__ + +The training data of this exercise is formed by a set of labeled 2D-points that belong to one of +two different classes. To make the exercise more appealing, the training data is generated +randomly using a uniform probability density functions (PDFs). + +We have divided the generation of the training data into two main parts. + +In the first part we generate data for both classes that is linearly separable. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup1 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup1 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup1 +@end_toggle + +In the second part we create data for both classes that is non-linearly separable, data that +overlaps. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup2 +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup2 +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup2 +@end_toggle + +- __Set up SVM's parameters__ + +@note In the previous tutorial @ref tutorial_introduction_to_svm there is an explanation of the +attributes of the class @ref cv::ml::SVM that we configure here before training the SVM. + +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp init +@end_toggle + +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java init +@end_toggle + +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py init +@end_toggle + +There are just two differences between the configuration we do here and the one that was done in +the previous tutorial (@ref tutorial_introduction_to_svm) that we use as reference. - The training data of this exercise is formed by a set of labeled 2D-points that belong to one of - two different classes. To make the exercise more appealing, the training data is generated - randomly using a uniform probability density functions (PDFs). +- _C_. 
We chose here a small value of this parameter in order not to punish too much the + misclassification errors in the optimization. The idea of doing this stems from the will of + obtaining a solution close to the one intuitively expected. However, we recommend to get a + better insight of the problem by making adjustments to this parameter. - We have divided the generation of the training data into two main parts. + @note In this case there are just very few points in the overlapping region between classes. + By giving a smaller value to __FRAC_LINEAR_SEP__ the density of points can be incremented and the + impact of the parameter _C_ explored deeply. - In the first part we generate data for both classes that is linearly separable. - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup1 +- _Termination Criteria of the algorithm_. The maximum number of iterations has to be + increased considerably in order to solve correctly a problem with non-linearly separable + training data. In particular, we have increased in five orders of magnitude this value. - In the second part we create data for both classes that is non-linearly separable, data that - overlaps. - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup2 +- __Train the SVM__ --# __Set up SVM's parameters__ +We call the method @ref cv::ml::SVM::train to build the SVM model. Watch out that the training +process may take a quite long time. Have patiance when your run the program. - @note In the previous tutorial @ref tutorial_introduction_to_svm there is an explanation of the - attributes of the class @ref cv::ml::SVM that we configure here before training the SVM. +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp train +@end_toggle - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp init +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java train +@end_toggle - There are just two differences between the configuration we do here and the one that was done in - the previous tutorial (@ref tutorial_introduction_to_svm) that we use as reference. +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py train +@end_toggle - - _C_. We chose here a small value of this parameter in order not to punish too much the - misclassification errors in the optimization. The idea of doing this stems from the will of - obtaining a solution close to the one intuitively expected. However, we recommend to get a - better insight of the problem by making adjustments to this parameter. +- __Show the Decision Regions__ - @note In this case there are just very few points in the overlapping region between classes. - By giving a smaller value to __FRAC_LINEAR_SEP__ the density of points can be incremented and the - impact of the parameter _C_ explored deeply. +The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In +this example we have used this method in order to color the space depending on the prediction done +by the SVM. In other words, an image is traversed interpreting its pixels as points of the +Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in +dark green if it is the class with label 1 and in dark blue if it is the class with label 2. - - _Termination Criteria of the algorithm_. 
The maximum number of iterations has to be - increased considerably in order to solve correctly a problem with non-linearly separable - training data. In particular, we have increased in five orders of magnitude this value. +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show +@end_toggle --# __Train the SVM__ +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show +@end_toggle - We call the method @ref cv::ml::SVM::train to build the SVM model. Watch out that the training - process may take a quite long time. Have patiance when your run the program. +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show +@end_toggle - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp train +- __Show the training data__ --# __Show the Decision Regions__ +The method @ref cv::circle is used to show the samples that compose the training data. The samples +of the class labeled with 1 are shown in light green and in light blue the samples of the class +labeled with 2. - The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In - this example we have used this method in order to color the space depending on the prediction done - by the SVM. In other words, an image is traversed interpreting its pixels as points of the - Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in - dark green if it is the class with label 1 and in dark blue if it is the class with label 2. +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_data +@end_toggle - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show +@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_data +@end_toggle --# __Show the training data__ +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_data +@end_toggle - The method @ref cv::circle is used to show the samples that compose the training data. The samples - of the class labeled with 1 are shown in light green and in light blue the samples of the class - labeled with 2. +- __Support vectors__ - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_data +We use here a couple of methods to obtain information about the support vectors. The method +@ref cv::ml::SVM::getSupportVectors obtain all support vectors. We have used this methods here +to find the training examples that are support vectors and highlight them. --# __Support vectors__ +@add_toggle_cpp +@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_vectors +@end_toggle - We use here a couple of methods to obtain information about the support vectors. The method - @ref cv::ml::SVM::getSupportVectors obtain all support vectors. We have used this methods here - to find the training examples that are support vectors and highlight them. 
+@add_toggle_java +@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_vectors +@end_toggle - @snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_vectors +@add_toggle_python +@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_vectors +@end_toggle Results ------- diff --git a/doc/tutorials/ml/table_of_content_ml.markdown b/doc/tutorials/ml/table_of_content_ml.markdown index 4e51fe55b8..b4064777a2 100644 --- a/doc/tutorials/ml/table_of_content_ml.markdown +++ b/doc/tutorials/ml/table_of_content_ml.markdown @@ -6,6 +6,8 @@ of data. - @subpage tutorial_introduction_to_svm + *Languages:* C++, Java, Python + *Compatibility:* \> OpenCV 2.0 *Author:* Fernando Iglesias García @@ -14,6 +16,8 @@ of data. - @subpage tutorial_non_linear_svms + *Languages:* C++, Java, Python + *Compatibility:* \> OpenCV 2.0 *Author:* Fernando Iglesias García @@ -23,6 +27,8 @@ of data. - @subpage tutorial_introduction_to_pca + *Languages:* C++, Java, Python + *Compatibility:* \> OpenCV 2.0 *Author:* Theodore Tsesmelis diff --git a/modules/calib3d/misc/java/gen_dict.json b/modules/calib3d/misc/java/gen_dict.json index 0e3519608b..3e69094487 100644 --- a/modules/calib3d/misc/java/gen_dict.json +++ b/modules/calib3d/misc/java/gen_dict.json @@ -17,6 +17,9 @@ ] } }, + "namespaces_dict": { + "cv.fisheye": "fisheye" + }, "func_arg_fix" : { "findFundamentalMat" : { "points1" : {"ctype" : "vector_Point2f"}, "points2" : {"ctype" : "vector_Point2f"} }, diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp index d0f17c900a..9297bd35b1 100644 --- a/modules/calib3d/src/calibinit.cpp +++ b/modules/calib3d/src/calibinit.cpp @@ -513,10 +513,6 @@ bool findChessboardCorners(InputArray image_, Size pattern_size, { cvtColor(img, img, COLOR_BGR2GRAY); } - else - { - img.clone(); - } int prev_sqr_size = 0; @@ -578,6 +574,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size, { if (flags & CALIB_CB_NORMALIZE_IMAGE) { + img = img.clone(); equalizeHist(img, img); } diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp index 5de4db9959..022eb1ff06 100644 --- a/modules/calib3d/src/calibration.cpp +++ b/modules/calib3d/src/calibration.cpp @@ -2336,10 +2336,13 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, _uu[2] = 1; cvCrossProduct(&uu, &t, &ww); nt = cvNorm(&t, 0, CV_L2); + CV_Assert(fabs(nt) > 0); nw = cvNorm(&ww, 0, CV_L2); + CV_Assert(fabs(nw) > 0); cvConvertScale(&ww, &ww, 1 / nw); cvCrossProduct(&t, &ww, &w3); nw = cvNorm(&w3, 0, CV_L2); + CV_Assert(fabs(nw) > 0); cvConvertScale(&w3, &w3, 1 / nw); _uu[2] = 0; @@ -3159,6 +3162,10 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints, Point3f* objPtData = objPtMat.ptr(); Point2f* imgPtData1 = imgPtMat1.ptr(); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif for( i = 0; i < nimages; i++, j += ni ) { Mat objpt = objectPoints.getMat(i); @@ -3176,6 +3183,9 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints, memcpy( imgPtData2 + j, imgpt2.ptr(), ni*sizeof(imgPtData2[0]) ); } } +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif } static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype) @@ -3870,12 +3880,14 @@ float cv::rectify3Collinear( InputArray _cameraMatrix1, InputArray _distCoeffs1, int idx = fabs(t12(0,0)) > fabs(t12(1,0)) ? 
0 : 1; double c = t12(idx,0), nt = norm(t12, CV_L2); + CV_Assert(fabs(nt) > 0); Mat_ uu = Mat_::zeros(3,1); uu(idx, 0) = c > 0 ? 1 : -1; // calculate global Z rotation Mat_ ww = t12.cross(uu), wR; double nw = norm(ww, CV_L2); + CV_Assert(fabs(nw) > 0); ww *= acos(fabs(c)/nt)/nw; Rodrigues(ww, wR); diff --git a/modules/calib3d/src/dls.cpp b/modules/calib3d/src/dls.cpp index d44c364b49..b0334c4268 100644 --- a/modules/calib3d/src/dls.cpp +++ b/modules/calib3d/src/dls.cpp @@ -206,6 +206,7 @@ void dls::run_kernel(const cv::Mat& pp) void dls::build_coeff_matrix(const cv::Mat& pp, cv::Mat& Mtilde, cv::Mat& D) { + CV_Assert(!pp.empty()); cv::Mat eye = cv::Mat::eye(3, 3, CV_64F); // build coeff matrix diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp index 4ca6b71ca1..83a5a88c5f 100644 --- a/modules/calib3d/src/fisheye.cpp +++ b/modules/calib3d/src/fisheye.cpp @@ -126,7 +126,8 @@ void cv::fisheye::projectPoints(InputArray objectPoints, OutputArray imagePoints { Vec3d Xi = objectPoints.depth() == CV_32F ? (Vec3d)Xf[i] : Xd[i]; Vec3d Y = aff*Xi; - + if (fabs(Y[2]) < DBL_MIN) + Y[2] = 1; Vec2d x(Y[0]/Y[2], Y[1]/Y[2]); double r2 = x.dot(x); @@ -1186,6 +1187,7 @@ void cv::internal::ComputeExtrinsicRefine(const Mat& imagePoints, const Mat& obj { CV_Assert(!objectPoints.empty() && objectPoints.type() == CV_64FC3); CV_Assert(!imagePoints.empty() && imagePoints.type() == CV_64FC2); + CV_Assert(rvec.total() > 2 && tvec.total() > 2); Vec6d extrinsics(rvec.at(0), rvec.at(1), rvec.at(2), tvec.at(0), tvec.at(1), tvec.at(2)); double change = 1; @@ -1365,9 +1367,13 @@ void cv::internal::InitExtrinsics(const Mat& _imagePoints, const Mat& _objectPoi double sc = .5 * (norm(H.col(0)) + norm(H.col(1))); H = H / sc; Mat u1 = H.col(0).clone(); - u1 = u1 / norm(u1); + double norm_u1 = norm(u1); + CV_Assert(fabs(norm_u1) > 0); + u1 = u1 / norm_u1; Mat u2 = H.col(1).clone() - u1.dot(H.col(1).clone()) * u1; - u2 = u2 / norm(u2); + double norm_u2 = norm(u2); + CV_Assert(fabs(norm_u2) > 0); + u2 = u2 / norm_u2; Mat u3 = u1.cross(u2); Mat RRR; hconcat(u1, u2, RRR); diff --git a/modules/calib3d/src/homography_decomp.cpp b/modules/calib3d/src/homography_decomp.cpp index 6975a7ef11..fea8882c5a 100644 --- a/modules/calib3d/src/homography_decomp.cpp +++ b/modules/calib3d/src/homography_decomp.cpp @@ -194,6 +194,7 @@ void HomographyDecompZhang::decompose(std::vector& camMotions) { Mat W, U, Vt; SVD::compute(getHnorm(), W, U, Vt); + CV_Assert(W.total() > 2 && Vt.total() > 7); double lambda1=W.at(0); double lambda3=W.at(2); double lambda1m3 = (lambda1-lambda3); diff --git a/modules/calib3d/test/test_cameracalibration_badarg.cpp b/modules/calib3d/test/test_cameracalibration_badarg.cpp index f9b19a8668..b63d4b4cfd 100644 --- a/modules/calib3d/test/test_cameracalibration_badarg.cpp +++ b/modules/calib3d/test/test_cameracalibration_badarg.cpp @@ -489,7 +489,14 @@ protected: void run(int /* start_from */ ) { CvMat zeros; +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif memset(&zeros, 0, sizeof(zeros)); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif C_Caller caller, bad_caller; CvMat objectPoints_c, r_vec_c, t_vec_c, A_c, distCoeffs_c, imagePoints_c, diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index 3dcf1b6d86..1697ded3a6 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -1981,10 +1981,20 @@ CV_EXPORTS_W void 
calcCovarMatrix( InputArray samples, OutputArray covar, CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean, OutputArray eigenvectors, int maxComponents = 0); +/** wrap PCA::operator() and add eigenvalues output parameter */ +CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean, + OutputArray eigenvectors, OutputArray eigenvalues, + int maxComponents = 0); + /** wrap PCA::operator() */ CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean, OutputArray eigenvectors, double retainedVariance); +/** wrap PCA::operator() and add eigenvalues output parameter */ +CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean, + OutputArray eigenvectors, OutputArray eigenvalues, + double retainedVariance); + /** wrap PCA::project */ CV_EXPORTS_W void PCAProject(InputArray data, InputArray mean, InputArray eigenvectors, OutputArray result); diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index f96df7c9a3..ae497d94d9 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -406,6 +406,24 @@ Cv64suf; #endif +/****************************************************************************************\ +* CV_NODISCARD attribute * +* encourages the compiler to issue a warning if the return value is discarded (C++17) * +\****************************************************************************************/ +#ifndef CV_NODISCARD +# if defined(__GNUC__) +# define CV_NODISCARD __attribute__((__warn_unused_result__)) // at least available with GCC 3.4 +# elif defined(__clang__) && defined(__has_attribute) +# if __has_attribute(__warn_unused_result__) +# define CV_NODISCARD __attribute__((__warn_unused_result__)) +# endif +# endif +#endif +#ifndef CV_NODISCARD +# define CV_NODISCARD /* nothing by default */ +#endif + + /****************************************************************************************\ * C++ 11 * \****************************************************************************************/ diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 9dcfc5623a..031f8f3d02 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -60,255 +60,72 @@ // access from within opencv code more accessible namespace cv { -#ifndef CV_DOXYGEN - -#ifdef CV_CPU_DISPATCH_MODE -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#else -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#endif - - -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END -using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -#endif - -//! @addtogroup core_hal_intrin -//! @{ - -//! 
@cond IGNORED template struct V_TypeTraits { - typedef _Tp int_type; - typedef _Tp uint_type; - typedef _Tp abs_type; - typedef _Tp sum_type; - - enum { delta = 0, shift = 0 }; - - static int_type reinterpret_int(_Tp x) { return x; } - static uint_type reinterpet_uint(_Tp x) { return x; } - static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uchar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef ushort w_type; - typedef unsigned q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef schar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef short w_type; - typedef int q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef ushort value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef unsigned w_type; - typedef uchar nu_type; - - enum { delta = 32768, shift = 16 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef short value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef int w_type; - typedef uchar nu_type; - typedef schar n_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef unsigned value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - typedef unsigned sum_type; - - typedef uint64 w_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - typedef int sum_type; - - typedef int64 w_type; - typedef short n_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uint64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef uint64 sum_type; - - typedef unsigned nu_type; - - static int_type reinterpret_int(value_type x) { return 
(int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef int64 sum_type; - - typedef int nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - - -template<> struct V_TypeTraits -{ - typedef float value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef float abs_type; - typedef float sum_type; - - typedef double w_type; - - static int_type reinterpret_int(value_type x) - { - Cv32suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv32suf u; - u.f = x; - return u.u; - } - static value_type reinterpret_from_int(int_type x) - { - Cv32suf u; - u.i = x; - return u.f; - } }; -template<> struct V_TypeTraits -{ - typedef double value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef double abs_type; - typedef double sum_type; - static int_type reinterpret_int(value_type x) - { - Cv64suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv64suf u; - u.f = x; - return u.u; +#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \ + template<> struct V_TypeTraits \ + { \ + typedef type value_type; \ + typedef int_type_ int_type; \ + typedef abs_type_ abs_type; \ + typedef uint_type_ uint_type; \ + typedef w_type_ w_type; \ + typedef q_type_ q_type; \ + typedef sum_type_ sum_type; \ + enum { nlanes128 = nlanes128_ }; \ + \ + static inline int_type reinterpret_int(type x) \ + { \ + union { type l; int_type i; } v; \ + v.l = x; \ + return v.i; \ + } \ + \ + static inline type reinterpret_from_int(int_type x) \ + { \ + union { type l; int_type i; } v; \ + v.i = x; \ + return v.l; \ + } \ } - static value_type reinterpret_from_int(int_type x) - { - Cv64suf u; - u.i = x; - return u.f; - } -}; -template struct V_SIMD128Traits -{ - enum { nlanes = 16 / sizeof(T) }; -}; +CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16); +CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16); +CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8); +CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8); +CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4); +CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4); +CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4); +CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2); -//! @endcond +#ifndef CV_DOXYGEN -//! 
@} +#ifdef CV_CPU_DISPATCH_MODE + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#else + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#endif -#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif } #ifdef CV_DOXYGEN +# undef CV_AVX2 # undef CV_SSE2 # undef CV_NEON # undef CV_VSX +# undef CV_FP16 #endif #if CV_SSE2 @@ -325,27 +142,25 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END #else +#define CV_SIMD128_CPP 1 #include "opencv2/core/hal/intrin_cpp.hpp" #endif -//! @addtogroup core_hal_intrin -//! @{ +// AVX2 can be used together with SSE2, so +// we define those two sets of intrinsics at once. +// Most of the intrinsics do not conflict (the proper overloaded variant is +// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2), +// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load(). +// Correspondingly, the wide intrinsics (which are mapped to the "widest" +// available instruction set) will get vx_ prefix +// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load()) +#if CV_AVX2 -#ifndef CV_SIMD128 -//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled) -#define CV_SIMD128 0 -#endif +#include "opencv2/core/hal/intrin_avx.hpp" -#ifndef CV_SIMD128_64F -//! Set to 1 if current intrinsics implementation supports 64-bit float vectors -#define CV_SIMD128_64F 0 #endif -//! @} - -//================================================================================================== - //! 
@cond IGNORED namespace cv { @@ -354,88 +169,175 @@ namespace cv { CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #endif -template struct V_RegTrait128; +#ifndef CV_SIMD128 +#define CV_SIMD128 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint8x16 reg; - typedef v_uint16x8 w_reg; - typedef v_uint32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_uint8x16 zero() { return v_setzero_u8(); } - static v_uint8x16 all(uchar val) { return v_setall_u8(val); } -}; +#ifndef CV_SIMD128_64F +#define CV_SIMD128_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int8x16 reg; - typedef v_int16x8 w_reg; - typedef v_int32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_int8x16 zero() { return v_setzero_s8(); } - static v_int8x16 all(schar val) { return v_setall_s8(val); } -}; +#ifndef CV_SIMD256 +#define CV_SIMD256 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint16x8 reg; - typedef v_uint32x4 w_reg; - typedef v_int16x8 int_reg; - typedef v_uint16x8 u_reg; - static v_uint16x8 zero() { return v_setzero_u16(); } - static v_uint16x8 all(ushort val) { return v_setall_u16(val); } -}; +#ifndef CV_SIMD256_64F +#define CV_SIMD256_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int16x8 reg; - typedef v_int32x4 w_reg; - typedef v_uint16x8 u_reg; - static v_int16x8 zero() { return v_setzero_s16(); } - static v_int16x8 all(short val) { return v_setall_s16(val); } -}; +#ifndef CV_SIMD512 +#define CV_SIMD512 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint32x4 reg; - typedef v_uint64x2 w_reg; - typedef v_int32x4 int_reg; - typedef v_uint32x4 u_reg; - static v_uint32x4 zero() { return v_setzero_u32(); } - static v_uint32x4 all(unsigned val) { return v_setall_u32(val); } -}; +#ifndef CV_SIMD512_64F +#define CV_SIMD512_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int32x4 reg; - typedef v_int64x2 w_reg; - typedef v_uint32x4 u_reg; - static v_int32x4 zero() { return v_setzero_s32(); } - static v_int32x4 all(int val) { return v_setall_s32(val); } -}; +#if CV_SIMD512 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD512_64F + #define CV_SIMD_WIDTH 64 +#elif CV_SIMD256 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD256_64F + #define CV_SIMD_WIDTH 32 +#else + #define CV_SIMD CV_SIMD128 + #define CV_SIMD_64F CV_SIMD128_64F + #define CV_SIMD_WIDTH 16 +#endif -template <> struct V_RegTrait128 { - typedef v_uint64x2 reg; - static v_uint64x2 zero() { return v_setzero_u64(); } - static v_uint64x2 all(uint64 val) { return v_setall_u64(val); } -}; +//================================================================================================== -template <> struct V_RegTrait128 { - typedef v_int64x2 reg; - static v_int64x2 zero() { return v_setzero_s64(); } - static v_int64x2 all(int64 val) { return v_setall_s64(val); } +#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \ + inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ + inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \ + inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ + inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ + inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } + +#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ +inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } + 
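To make the purpose of these wide `vx_` aliases concrete, here is a minimal usage sketch (the function name and the scalar tail are illustrative assumptions, not part of this header): the same loop body compiles to 128-bit or 256-bit code depending on which instruction set the build enables, because `v_float32`, `vx_load`, `vx_store` and `v_float32::nlanes` all follow the widest available SIMD width.

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Adds two float arrays; the vector body runs with whatever SIMD width is enabled
// (v_float32 has 4 lanes for SSE2/NEON, 8 lanes for AVX2).
static void add_arrays(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
#if CV_SIMD
    const int step = cv::v_float32::nlanes;
    for (; i <= n - step; i += step)
    {
        cv::v_float32 va = cv::vx_load(a + i);   // widest available load
        cv::v_float32 vb = cv::vx_load(b + i);
        cv::vx_store(dst + i, va + vb);
    }
    cv::vx_cleanup();                            // e.g. clears upper AVX state after the vector loop
#endif
    for (; i < n; ++i)                           // scalar tail
        dst[i] = a[i] + b[i];
}
```

Kernels written this way pick up AVX2 automatically once a CV_AVX2 build includes intrin_avx.hpp, without any change to the loop body.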
+#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ +inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + +template struct V_RegTraits +{ }; -template <> struct V_RegTrait128 { - typedef v_float32x4 reg; - typedef v_int32x4 int_reg; - typedef v_float32x4 u_reg; - static v_float32x4 zero() { return v_setzero_f32(); } - static v_float32x4 all(float val) { return v_setall_f32(val); } -}; +#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \ + template<> struct V_RegTraits<_reg> \ + { \ + typedef _reg reg; \ + typedef _u_reg u_reg; \ + typedef _w_reg w_reg; \ + typedef _q_reg q_reg; \ + typedef _int_reg int_reg; \ + typedef _round_reg round_reg; \ + } +#if CV_SIMD128 || CV_SIMD128_CPP + CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void); + CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void); #if CV_SIMD128_64F -template <> struct V_RegTrait128 { - typedef v_float64x2 reg; - typedef v_int32x4 int_reg; - typedef v_float64x2 u_reg; - static v_float64x2 zero() { return v_setzero_f64(); } - static v_float64x2 all(double val) { return v_setall_f64(val); } -}; + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4); +#else + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4); +#endif + CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void); + CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void); +#if CV_SIMD128_64F + CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); +#endif +#if CV_FP16 + CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8); 
+#endif +#endif + +#if CV_SIMD256 + CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8); + CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); +#if CV_FP16 + CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void); +#endif +#endif + +#if CV_SIMD256 + typedef v_uint8x32 v_uint8; + typedef v_int8x32 v_int8; + typedef v_uint16x16 v_uint16; + typedef v_int16x16 v_int16; + typedef v_uint32x8 v_uint32; + typedef v_int32x8 v_int32; + typedef v_uint64x4 v_uint64; + typedef v_int64x4 v_int64; + typedef v_float32x8 v_float32; + #if CV_SIMD256_64F + typedef v_float64x4 v_float64; + #endif + #if CV_FP16 + typedef v_float16x16 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) + inline void vx_cleanup() { v256_cleanup(); } +#elif CV_SIMD128 + typedef v_uint8x16 v_uint8; + typedef v_int8x16 v_int8; + typedef v_uint16x8 v_uint16; + typedef v_int16x8 v_int16; + typedef v_uint32x4 v_uint32; + typedef v_int32x4 v_int32; + typedef v_uint64x2 v_uint64; + typedef v_int64x2 v_int64; + typedef v_float32x4 v_float32; + #if CV_SIMD128_64F + typedef v_float64x2 v_float64; + #endif + #if CV_FP16 + typedef v_float16x8 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) + #if CV_SIMD128_64F + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) + #endif + inline void vx_cleanup() { v_cleanup(); } #endif inline unsigned int trailingZeros32(unsigned int value) { diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp new file mode 100644 index 0000000000..7e983fd24f --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -0,0 +1,2016 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_AVX_HPP +#define OPENCV_HAL_INTRIN_AVX_HPP + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); } + +inline int _v_cvtsi256_si32(const __m256i& a) +{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } + +inline __m256i _v256_shuffle_odd_64(const __m256i& v) +{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return _mm256_permute2x128_si256(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return _mm256_permute2f128_ps(a, b, imm); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return _mm256_permute2f128_pd(a, b, imm); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return _mm256_permute4x64_epi64(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return _mm256_permute4x64_pd(a, imm); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ return _mm256_extracti128_si256(v, 1); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return _mm256_extractf128_ps(v, 1); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return _mm256_extractf128_pd(v, 1); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return _mm256_castsi256_si128(v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return _mm256_castps256_ps128(v); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return _mm256_castpd256_pd128(v); } + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + v_uint8x32() : val(_mm256_setzero_si256()) {} + uchar get0() const { return (uchar)_v_cvtsi256_si32(val); } +}; + +struct v_int8x32 +{ + typedef schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar 
v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + v_int8x32() : val(_mm256_setzero_si256()) {} + schar get0() const { return (schar)_v_cvtsi256_si32(val); } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + v_uint16x16() : val(_mm256_setzero_si256()) {} + ushort get0() const { return (ushort)_v_cvtsi256_si32(val); } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + v_uint32x8() : val(_mm256_setzero_si256()) {} + unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_int32x8() : val(_mm256_setzero_si256()) {} + int get0() const { return _v_cvtsi256_si32(val); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_float32x8() : val(_mm256_setzero_ps()) {} + float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + v_uint64x4() : val(_mm256_setzero_si256()) {} + uint64 get0() const + { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_int64x4 +{ + 
typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _mm256_setr_epi64x(v0, v1, v2, v3); } + v_int64x4() : val(_mm256_setzero_si256()) {} + int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + v_float64x4(double v0, double v1, double v2, double v3) + { val = _mm256_setr_pd(v0, v1, v2, v3); } + v_float64x4() : val(_mm256_setzero_pd()) {} + double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); } +}; + +struct v_float16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_float16x16(__m256i v) : val(v) {} + v_float16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; +inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); } +inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); } + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(_mm256_castsi128_si256(v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \ + __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_si256((__m256i*)ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64) + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_##suffix(ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \ + (_mm_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + 
halfreg vlo = _mm_loadu_##suffix(ptr0); \ + halfreg vhi = _mm_loadu_##suffix(ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_##suffix(ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128) +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d) + +#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_si256()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256) + +OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64) +OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64) + +#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_##zsuffix()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castpd_ps(a.val)); } + +inline v_float64x4 v_reinterpret_as_f64(const 
v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_castps_pd(a.val)); } + +inline v_float16x16 v256_load_f16(const short* ptr) +{ return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); } +inline v_float16x16 v256_load_f16_aligned(const short* ptr) +{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x16& a) +{ _mm256_storeu_si256((__m256i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x16& a) +{ _mm256_store_si256((__m256i*)ptr, a.val); } + +/* Recombine */ +/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x20)); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x31)); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { c = v_combine_low(a, b); d = v_combine_high(a, b); } + +#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \ + inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \ + _Tpvec& b0, _Tpvec& b1) \ + { \ + __m256i v0 = _v256_shuffle_odd_64(a0.val); \ + __m256i v1 = _v256_shuffle_odd_64(a1.val); \ + b0.val = _mm256_unpacklo_##suffix(v0, v1); \ + b1.val = _mm256_unpackhi_##suffix(v0, v1); \ + } + +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd) + +inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1) +{ + __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val); + __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val); + v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1); +} + +inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1) +{ + __m256d v0 = _v_shuffle_odd_64(a0.val); + __m256d v1 = _v_shuffle_odd_64(a1.val); + b0.val = _mm256_unpacklo_pd(v0, v1); + b1.val = _mm256_unpackhi_pd(v0, v1); +}*/ + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd) + +// blend +#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \ + template \ + inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blend_##suffix(a.val, 
b.val, m)); } + +OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd) + +template +inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b) +{ + enum {M0 = m}; + enum {M1 = (M0 | (M0 << 2)) & 0x33}; + enum {M2 = (M1 | (M1 << 1)) & 0x55}; + enum {MM = M2 | (M2 << 1)}; + return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM)); +} +template +inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b) +{ return v_int64x4(v256_blend(v_uint64x4(a.val), v_uint64x4(b.val)).val); } + +// shuffle +// todo: emluate 64bit +#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \ + template \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(_mm256_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd) + +template +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); } + +inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v256_blend<0b11110000>(a, b); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v256_blend<0b1100>(a, b); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x21>(a, b); } + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); } +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); } + +// ZIP +#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x20>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x31>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations 
*/ + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = _mm256_mul_epu32(a.val, b.val); + __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32)); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) 
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16) +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32) + +inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm) +{ + __m256i d = _mm256_set1_epi64x((int64)1 << 63); + __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm); + return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm)); +} +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } \ + inline _Tpvec operator < (const 
_Tpvec& a, const _Tpvec& b) \ + { return b > a; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { \ + __m256i smask = _mm256_set1_##suffix(sbit); \ + return _Tpuvec(_mm256_cmpgt_##suffix( \ + _mm256_xor_si256(a.val, smask), \ + _mm256_xor_si256(b.val, smask))); \ + } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) + +#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) + +/** min/max **/ +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = 
_mm256_permute2x128_si256(a.val, b.val, 0x03); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm); + else if (imm < 32) + res.val = _mm256_slli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(swapz, a.val, imm); + else if (imm < 32) + res.val = _mm256_srli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd) + +////////// Reduce and mask ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 2)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, 
ushort, min, _mm_min_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \ + return _mm_cvtss_f32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps) +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps) + +inline ushort v_reduce_sum(const v_uint16x16& a) +{ + __m128i a0 = _v256_extract_low(a.val); + __m128i a1 = _v256_extract_high(a.val); + + __m128i s0 = _mm_adds_epu16(a0, a1); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2)); + + return (ushort)_mm_cvtsi128_si32(s0); +} + +inline short v_reduce_sum(const v_int16x16& a) +{ + __m256i s0 = _mm256_hadds_epi16(a.val, a.val); + s0 = _mm256_hadds_epi16(s0, s0); + s0 = _mm256_hadds_epi16(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_adds_epi16(_v256_extract_low(s0), s1); + + return (short)_mm_cvtsi128_si32(s1); +} + +inline int v_reduce_sum(const v_int32x8& a) +{ + __m256i s0 = _mm256_hadd_epi32(a.val, a.val); + s0 = _mm256_hadd_epi32(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_add_epi32(_v256_extract_low(s0), s1); + + return _mm_cvtsi128_si32(s1); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + __m256 s0 = _mm256_hadd_ps(a.val, a.val); + s0 = _mm256_hadd_ps(s0, s0); + + __m128 s1 = _v256_extract_high(s0); + s1 = _mm_add_ps(_v256_extract_low(s0), s1); + + return _mm_cvtss_f32(s1); +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + __m256 ab = _mm256_hadd_ps(a.val, b.val); + __m256 cd = _mm256_hadd_ps(c.val, d.val); + return v_float32x8(_mm256_hadd_ps(ab, cd)); +} + +/** Popcount **/ +#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \ + inline v_uint32x8 v_popcount(const _Tpvec& a) \ + { \ + const v_uint32x8 m1 = v256_setall_u32(0x55555555); \ + const v_uint32x8 m2 = v256_setall_u32(0x33333333); \ + const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \ + v_uint32x8 p = v_reinterpret_as_u32(a); \ + p = ((p >> 1) & m1) + (p & m1); \ + p = ((p >> 2) & m2) + (p & m2); \ + p = ((p >> 4) & m4) + (p & m4); \ + p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \ + return p; \ + } + +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32) 
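// The OPENCV_HAL_IMPL_AVX_POPCOUNT macro above counts set bits with the classic
// SWAR ("SIMD within a register") bit-slicing trick. A minimal scalar sketch of the
// same idea for a single 32-bit value (illustrative only, names are made up):
//
//   inline unsigned popcount32_sketch(unsigned p)
//   {
//       p = ((p >> 1) & 0x55555555u) + (p & 0x55555555u); // 2-bit partial sums
//       p = ((p >> 2) & 0x33333333u) + (p & 0x33333333u); // 4-bit partial sums
//       p = ((p >> 4) & 0x0f0f0f0fu) + (p & 0x0f0f0f0fu); // per-byte bit counts
//       return (p * 0x01010101u) >> 24;                   // add the four byte counts
//   }
//
// The vector version performs the first three steps on all lanes at once and then
// uses _mm256_sad_epu8 against zero to sum the per-byte counts of each 64-bit group.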
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8) + +/** Mask **/ +inline int v_signmask(const v_int8x32& a) +{ return _mm256_movemask_epi8(a.val); } +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val)); + return v_signmask(v) & 255; +} +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + __m256i a16 = _mm256_packs_epi32(a.val, a.val); + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16)); + return v_signmask(v) & 15; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return _mm256_movemask_ps(a.val); } +inline int v_signmask(const v_float64x4& a) +{ return _mm256_movemask_pd(a.val); } + +/** Checks **/ +#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888) +OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888) + +#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15) + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b*b)); } + +OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ + v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); + // todo: _mm256_fnmsub_ps + t *= v256_setall_f32(1.5) - ((t * t) * half); + return t; +} + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ + return v256_setall_f64(1.) 
/ v_sqrt(x); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8) +OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) +OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = v_sub_wrap(a, b); + v_int8x32 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 d = a - b; + v_int32x8 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(a - b); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(a - b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(_mm256_cvtps_epi32(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val); + return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); } + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ return 
v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); } + +#if CV_FP16 +inline v_float32x8 v_cvt_f32(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); } + +inline v_float32x8 v_cvt_f32_high(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); } + +inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0); + return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1)); +} +#endif + +////////////// Lookup table access //////////////////// + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01, xy45, xy23, xy67; + xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0])); + xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1])); + xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4])); + xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5])); + __m256 xy0145 = _v256_combine(xy01, xy45); + xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2])); + xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3])); + xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6])); + xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7])); + __m256 xy2367 = _v256_combine(xy23, xy67); + + __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367); + __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367); + + x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367)); + y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy2 = _mm_loadu_pd(tab + idx[2]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + __m128d xy3 = _mm_loadu_pd(tab + idx[3]); + __m256d xy02 = _v256_combine(xy0, xy2); + __m256d xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13)); + y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13)); +} + +////////// Matrix operations ///////// + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b) + c; } + +#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \ + v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const 
v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \ + __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \ + __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \ + __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v256_extract_low(a.val)); \ + b1.val = intrin(_v256_extract_high(a.val)); \ + } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) +{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); } + +inline void v_pack_store(schar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline 
void v_pack_store(uchar* ptr, const v_uint16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template<int n> inline +v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), + v_reinterpret_as_s16((b + delta) >> n)); +} + +template<int n> inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) +{ + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); +} + +template<int n> inline +v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template<int n> inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template<int n> inline +v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template<int n> inline +void v_rshr_pack_store(schar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_store(ptr, (a + delta) >> n); +} + +// 32 +inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) +{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) +{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); } + +inline void v_pack_store(short* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort* ptr, const v_uint32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + + +template<int n> inline +v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) +{ + // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
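// With delta = 2^(n-1), each output lane is effectively
//   saturate_cast<ushort>((a_i + 2^(n-1)) >> n)
// i.e. a round-half-up right shift by n followed by a saturating narrowing pack;
// the reinterpret to signed below is only there to reuse v_pack_u, which packs
// signed 32-bit values to unsigned 16-bit with saturation.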
+ v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), + v_reinterpret_as_s32((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) +{ + v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); +} + +template inline +v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(short* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +// 64 +// Non-saturating pack +inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3 + return v_uint32x8(_v256_shuffle_odd_64(ab)); +} + +inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0))); +} + +inline void v_pack_store(int* ptr, const v_int64x4& b) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } + +template inline +v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +template inline +v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(int* ptr, const v_int64x4& a) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +/* Recombine */ +// its up there with load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \ + template \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right(a, b); } + +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) + + +/** Reinterpret **/ +// its up there with load and store operations + +/* de&interleave */ +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& 
a, _Tpvec& b) \ + { return v256_load_deinterleave_##suffix(ptr, a, b); } \ + inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ + { return v256_store_interleave_2ch(ptr, a, b); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \ + { return v256_store_interleave_##suffix(ptr, a, b, c); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \ + { return v256_store_interleave_##suffix(ptr, a, b, c, d); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) + +/* **** */ +// +template +inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) +{ + _Tpvec ab0, ab1; + v_zip(a, b, ab0, ab1); + v_store(ptr, ab0); + v_store(ptr + _Tpvec::nlanes, ab1); +} + +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + _Tpvec ab0 = v256_load(ptr); + _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec ab00, ab11; + v_recombine(ab0, ab1, ab00, ab11); + v256_zip(ab00, ab11, a, b); +} + +/// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc0 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec ab0 = v256_combine_diagonal(abc0, abc1); + _Tpvec bc1 = v256_combine_diagonal(abc1, abc2); + _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0)); + + a = v256_unpacklo(ab0, ac1); + c = v256_unpackhi(ac1, bc1); + b = v256_alignr_64(bc1, ab0); +} + + +template +inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0 = v256_unpacklo(a, b); + _Tpvec bc1 = v256_unpackhi(b, c); + _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a)); + + v_store(ptr, v256_combine_diagonal(ab0, ca10)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1)); +} + +//// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec abcd0 = v256_load(ptr); + _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2); + _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3); + + _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2); + _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3); + + _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0); + _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1); + _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2); + _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3); + + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l4(_Tp* 
ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_zip(a, b, ab0, ab1); + v256_zip(c, d, cd0, cd1); + + _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0); + _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1); + + v_store(ptr, v256_combine_diagonal(ab0, ab0cd0)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0)); + v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1)); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4) + +/* **** **** */ +// +inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b) +{ + v_float32x8 ab0 = v256_load(ptr); + v_float32x8 ab1 = v256_load(ptr + 8); + + v_float32x8 ab0ab2, ab1ab3; + v_recombine(ab0, ab1, ab0ab2, ab1ab3); + + a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0)); + b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1)); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + v_float32x8 fa, fb; + v256_load_deinterleave_l8((float*)ptr, fa, fb); + a.val = v_reinterpret_as_u32(fa).val; + b.val = v_reinterpret_as_u32(fb).val; +} +/// +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + _Tpvec cazg = v256_blend<0b10101010>(c, a); + _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); + _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); + _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0)); + + _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + _Tpvec::nlanes, abc1); + v_store(ptr + _Tpvec::nlanes * 2, abc2); +} + +inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c) +{ + v_float32x8 ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + v_float32x8 cazg = v256_blend<0b10101010>(c, a); + v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); + v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); + + v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2))); + v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2); + + v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + 8, abc1); + v_store(ptr + 16, abc2); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc02 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec abc2 = v256_alignr_128(abc02, abc20); + _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); + + a = v256_blend<0b10010010>(abc0, abc1); + a = v256_blend<0b01000100>(a, abc2); + + b = v256_blend<0b00100100>(abc0, abc1); + b = v256_blend<0b10011001>(b, abc2); + + c = v256_blend<0b01001001>(abc0, abc1); + c = v256_blend<0b00100010>(c, abc2); + 
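// At this point a, b and c each hold the lanes of one plane, but the stride-3
// interleaving leaves them rotated within each 128-bit half; the v256_shuffle
// calls below rotate every half back so the elements appear in ascending order.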
+ a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); + b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); + c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c); +} +///// +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ac0, ac1, bd0, bd1; + v256_zip(a, c, ac0, ac1); + v256_zip(b, d, bd0, bd1); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v256_zip(ac0, bd0, abcd0, abcd1); + v256_zip(ac1, bd1, abcd2, abcd3); + + _Tpvec abcd01, abcd23, abcd45, abcd67; + v_recombine(abcd0, abcd1, abcd01, abcd45); + v_recombine(abcd2, abcd3, abcd23, abcd67); + + v_store(ptr, abcd01); + v_store(ptr + _Tpvec::nlanes, abcd23); + v_store(ptr + _Tpvec::nlanes * 2, abcd45); + v_store(ptr + _Tpvec::nlanes * 3, abcd67); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8) + +/* ******** ******** */ +// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} +/// +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b)); + v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b)); + v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); + v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); + + v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a)); + cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); + + v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1); + ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); + + v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg); + v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0); + + v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); + v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); + + v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0)); + v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0)); + abc21 = v256_swap_halves(abc21); + v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1)); + + v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21); + v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01); + v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12); + + v_store(ptr, _Tpvec(abc0.val)); + v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val)); + v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val)); +} +// todo: +template +inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, 
a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16) + +/* **************** **************** */ +// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} + +/// todo +template +inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&) +{} +template +inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + ); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1); + v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3); + + __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep); + __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep); + __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep); + __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep); + + __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1); + __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3); + __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1); + __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3); + + a.val = _mm256_unpacklo_epi64(ab0, ab1); + b.val = _mm256_unpackhi_epi64(ab0, ab1); + c.val = _mm256_unpacklo_epi64(cd0, cd1); + d.val = _mm256_unpackhi_epi64(cd0, cd1); +} + +template +inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32) + +inline void v256_cleanup() { _mm256_zeroupper(); } + +//! @name Check SIMD256 support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD256() +{ + return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false; +} +//! @} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index c7cbb578db..1f5f53100a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -247,8 +247,6 @@ template struct v_reg { //! 
@cond IGNORED typedef _Tp lane_type; - typedef v_reg::int_type, n> int_vec; - typedef v_reg::abs_type, n> abs_vec; enum { nlanes = n }; // !@endcond @@ -797,11 +795,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Multiply and add -Returns \f$ a*b + c \f$ -For floating point types and signed 32bit int only. */ + Returns \f$ a*b + c \f$ + For floating point types and signed 32bit int only. */ template -inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) +inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) { v_reg<_Tp, n> d; for( int i = 0; i < n; i++ ) @@ -809,6 +807,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, return d; } +/** @brief A synonym for v_fma */ +template +inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + return v_fma(a, b, c); +} + /** @brief Dot product of elements Multiply values in two registers and sum adjacent result pairs. @@ -1141,9 +1147,9 @@ template inline void v_zip( const v_reg<_Tp, n>& a0, const @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load register contents from memory (aligned) @@ -1151,9 +1157,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary) */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load 64-bits of data to lower part (high part is undefined). @@ -1166,9 +1172,9 @@ v_int32x4 r = v_load_low(lo); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = ptr[i]; @@ -1187,9 +1193,9 @@ v_int32x4 r = v_load_halves(lo, hi); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = loptr[i]; @@ -1208,11 +1214,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-, 16-, 32-bit integer source types. 
*/ template -inline v_reg::w_type, V_SIMD128Traits<_Tp>::nlanes / 2> +inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> v_load_expand(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1229,11 +1235,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-bit integer source types. */ template -inline v_reg::q_type, V_SIMD128Traits<_Tp>::nlanes / 4> +inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> v_load_expand_q(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1622,6 +1628,17 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = (float)a.s[i]; + c.s[i+n] = (float)b.s[i]; + } + return c; +} + /** @brief Convert to double Supported input type is cv::v_int32x4. */ @@ -1644,6 +1661,52 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +template inline v_reg v_lut(const int* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const float* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const double* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + +template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + /** @brief Transpose 4x4 matrix Scheme: @@ -1968,6 +2031,8 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +inline void v_cleanup() {} + //! @} //! 
@name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 9dadab57ea..fdb3ec09cb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -280,11 +280,29 @@ struct v_float64x2 #if CV_FP16 // Workaround for old compilers -template static inline int16x4_t vreinterpret_s16_f16(T a) -{ return (int16x4_t)a; } -template static inline float16x4_t vreinterpret_f16_s16(T a) -{ return (float16x4_t)a; } -template static inline float16x4_t cv_vld1_f16(const T* ptr) +static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } +static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } +static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; } +static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; } + +static inline float16x8_t cv_vld1q_f16(const void* ptr) +{ +#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro + return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); +#else + return vld1q_f16((const __fp16*)ptr); +#endif +} +static inline void cv_vst1q_f16(void* ptr, float16x8_t a) +{ +#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro + vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); +#else + vst1q_f16((__fp16*)ptr, a); +#endif +} + +static inline float16x4_t cv_vld1_f16(const void* ptr) { #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); @@ -292,7 +310,7 @@ template static inline float16x4_t cv_vld1_f16(const T* ptr) return vld1_f16((const __fp16*)ptr); #endif } -template static inline void cv_vst1_f16(T* ptr, float16x4_t a) +static inline void cv_vst1_f16(void* ptr, float16x4_t a) { #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); @@ -301,24 +319,28 @@ template static inline void cv_vst1_f16(T* ptr, float16x4_t a) #endif } -struct v_float16x4 + +struct v_float16x8 { typedef short lane_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() {} - explicit v_float16x4(float16x4_t v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() {} + explicit v_float16x8(float16x8_t v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - short v[] = {v0, v1, v2, v3}; - val = cv_vld1_f16(v); + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = cv_vld1q_f16(v); } short get0() const { - return vget_lane_s16(vreinterpret_s16_f16(val), 0); + return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); } - float16x4_t val; + float16x8_t val; }; + +inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } +inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } #endif #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ @@ -731,16 +753,32 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); } -inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { +#if CV_SIMD128_64F + // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined), + // also adds FMA 
support both for single- and double-precision floating-point vectors + return v_float32x4(vfmaq_f32(c.val, a.val, b.val)); +#else return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +#endif } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return v_int32x4(vmlaq_s32(c.val, a.val, b.val)); } +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + #if CV_SIMD128_64F inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) { @@ -753,9 +791,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val))); } +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(vfmaq_f64(c.val, a.val, b.val)); +} + inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val))); + return v_fma(a, b, c); } #endif @@ -841,10 +884,15 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #if CV_FP16 // Workaround for old comiplers -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(cv_vld1_f16(ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ cv_vst1_f16(ptr, a.val); } +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } +inline v_float16x8 v_load_f16_aligned(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } #endif #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ @@ -1293,6 +1341,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val)))); @@ -1315,17 +1368,88 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) #endif #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) +{ + return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); +} +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) { - return v_float32x4(vcvt_f32_f16(a.val)); + return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(vcvt_f16_f32(a.val)); + return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_int32x4(vld1q_s32(elems)); +} + +inline 
v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_float32x4(vld1q_f32(elems)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + /*int CV_DECL_ALIGNED(32) idx[4]; + v_store(idx, idxvec); + + float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2])); + float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3])); + + float32x4x2_t xxyy = vuzpq_f32(xy02, xy13); + x = v_float32x4(xxyy.val[0]); + y = v_float32x4(xxyy.val[1]);*/ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + }; + return v_float64x2(vld1q_f64(elems)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} +#endif + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 8c61f44f4a..b79ea16a4d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -58,6 +58,17 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +struct v_uint8x16; +struct v_int8x16; +struct v_uint16x8; +struct v_int16x8; +struct v_uint32x4; +struct v_int32x4; +struct v_float32x4; +struct v_uint64x2; +struct v_int64x2; +struct v_float64x2; + struct v_uint8x16 { typedef uchar lane_type; @@ -144,6 +155,7 @@ struct v_int16x8 { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -163,6 +175,7 @@ struct v_uint32x4 { return (unsigned)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -182,6 +195,7 @@ struct v_int32x4 { return _mm_cvtsi128_si32(val); } + __m128i val; }; @@ -201,6 +215,7 @@ struct v_float32x4 { return _mm_cvtss_f32(val); } + __m128 val; }; @@ -222,6 +237,7 @@ struct v_uint64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (unsigned)a | ((uint64)(unsigned)b << 32); } + __m128i val; }; @@ -243,6 +259,7 @@ struct v_int64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); } + __m128i val; }; @@ -262,29 +279,31 @@ struct v_float64x2 { return _mm_cvtsd_f64(val); } + __m128d val; }; -#if CV_FP16 -struct v_float16x4 +struct v_float16x8 { typedef short lane_type; typedef __m128i vector_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() : val(_mm_setzero_si128()) {} - explicit v_float16x4(__m128i v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() : val(_mm_setzero_si128()) {} + explicit v_float16x8(__m128i v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - val = _mm_setr_epi16(v0, v1, 
v2, v3, 0, 0, 0, 0); + val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } short get0() const { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; -#endif +inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); } +inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); } namespace hal_sse_internal { @@ -697,11 +716,15 @@ inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) } inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + return v_int32x4(_mm_mullo_epi32(a.val, b.val)); +#else __m128i c0 = _mm_mul_epu32(a.val, b.val); __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); __m128i d0 = _mm_unpacklo_epi32(c0, c1); __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); +#endif } inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) { @@ -1027,11 +1050,35 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) __m128i m = _mm_cmpgt_epi32(b.val, a.val); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ +#if CV_FMA3 + return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val)); +#else + return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val)); +#endif +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ +#if CV_FMA3 + return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val)); +#else + return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val)); +#endif +} + #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1040,17 +1087,16 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(_mm_sqrt_##suffix(res)); \ + _Tpvec res = v_fma(a, a, b*b); \ + return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(res); \ + return v_fma(a, a, b*b); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ - return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ + return v_fma(a, b, c); \ } OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) @@ -1268,12 +1314,15 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) -#if CV_FP16 -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ _mm_storel_epi64((__m128i*)ptr, a.val); } -#endif +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); } +inline v_float16x8 
v_load_f16_aligned(const short* ptr) +{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ _mm_storeu_si128((__m128i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ _mm_store_si128((__m128i*)ptr, a.val); } #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ inline scalartype v_reduce_##func(const v_##_Tpvec& a) \ @@ -2183,6 +2232,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(_mm_cvtpd_ps(a.val)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(_mm_cvtepi32_pd(a.val)); @@ -2200,21 +2254,82 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { - return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8)))); + return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) { return v_float32x4(_mm_cvtph_ps(a.val)); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) +{ + return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val))); +} + +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(_mm_cvtps_ph(a.val, 0)); + return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int idx[2]; + v_store_low(idx, idxvec); + return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); +} + +// loads pairs from the table and deinterleaves them, e.g. returns: +// x = (tab[idxvec[0], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]), +// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1]) +// note that the indices are float's indices, not the float-pair indices. +// in theory, this function can be used to implement bilinear interpolation, +// when idxvec are the offsets within the image. 
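The deinterleaving gather described in the comment above (its definition follows just below) is easiest to see in a small, self-contained usage sketch. This is illustrative only: it assumes an OpenCV build with CV_SIMD128 enabled and intrin.hpp included, and the helper name split_points and its data layout are hypothetical.

#include <opencv2/core/hal/intrin.hpp>

// Splits packed (x, y) float pairs into separate x and y arrays,
// four points per iteration, with a scalar fallback for the tail.
void split_points(const float* pts /* x0,y0,x1,y1,... */, int n, float* xs, float* ys)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD128
    for (; i <= n - 4; i += 4)
    {
        // offsets of the x components of four consecutive points
        v_int32x4 idx(2*i, 2*(i+1), 2*(i+2), 2*(i+3));
        v_float32x4 x, y;
        v_lut_deinterleave(pts, idx, x, y); // x <- pts[idx[k]], y <- pts[idx[k]+1]
        v_store(xs + i, x);
        v_store(ys + i, y);
    }
#endif
    for (; i < n; i++)
    {
        xs[i] = pts[2*i];
        ys[i] = pts[2*i + 1];
    }
}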
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0])); + __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2])); + xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1])); + xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3])); + __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23); + __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23); + x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13)); + y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int idx[2]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + x = v_float64x2(_mm_unpacklo_pd(xy0, xy1)); + y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); +} + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 8b76dd8487..069e9578eb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -764,6 +764,8 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \ +inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { return _Tpvec(vec_madd(a.val, b.val, c.val)); } @@ -836,6 +838,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); } + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); } @@ -848,6 +853,48 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + 
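The v_fma/v_muladd overloads added to the NEON, SSE and VSX backends above are normally consumed through the generic v_muladd form, for example in Horner-style polynomial evaluation. A minimal sketch, assuming an OpenCV build with CV_SIMD128 available; the function name poly2 and its coefficients are made up for illustration:

#include <opencv2/core/hal/intrin.hpp>

// Evaluates a*x^2 + b*x + c over a float buffer with Horner's scheme;
// each v_muladd step maps to a single fused multiply-add where the backend has one.
void poly2(const float* x, float* y, int n, float a, float b, float c)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD128
    v_float32x4 va = v_setall_f32(a), vb = v_setall_f32(b), vc = v_setall_f32(c);
    for (; i <= n - 4; i += 4)
    {
        v_float32x4 vx = v_load(x + i);
        v_float32x4 vy = v_muladd(v_muladd(va, vx, vb), vx, vc); // (a*x + b)*x + c
        v_store(y + i, vy);
    }
#endif
    for (; i < n; i++)
        y[i] = (a * x[i] + b) * x[i] + c;
}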
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} + +inline void v_cleanup() {} + + /** Reinterpret **/ /** its up there with load and store operations **/ diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 9ca08da80a..baec499750 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -1165,7 +1165,7 @@ public: The method creates a full copy of the array. The original step[] is not taken into account. So, the array copy is a continuous array occupying total()*elemSize() bytes. */ - Mat clone() const; + Mat clone() const CV_NODISCARD; /** @brief Copies the matrix to another one. @@ -2252,7 +2252,7 @@ public: Mat_ row(int y) const; Mat_ col(int x) const; Mat_ diag(int d=0) const; - Mat_ clone() const; + Mat_ clone() const CV_NODISCARD; //! overridden forms of Mat::elemSize() etc. size_t elemSize() const; @@ -2429,7 +2429,7 @@ public: static UMat diag(const UMat& d); //! returns deep copy of the matrix, i.e. the data is copied - UMat clone() const; + UMat clone() const CV_NODISCARD; //! copies the matrix content to "m". // It calls m.create(this->size(), this->type()). void copyTo( OutputArray m ) const; @@ -2722,7 +2722,7 @@ public: SparseMat& operator = (const Mat& m); //! creates full copy of the matrix - SparseMat clone() const; + SparseMat clone() const CV_NODISCARD; //! copies all the data to the destination matrix. All the previous content of m is erased void copyTo( SparseMat& m ) const; @@ -2959,7 +2959,7 @@ public: SparseMat_& operator = (const Mat& m); //! makes full copy of the matrix. All the elements are duplicated - SparseMat_ clone() const; + SparseMat_ clone() const CV_NODISCARD; //! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type) void create(int dims, const int* _sizes); //! converts sparse matrix to the old-style CvSparseMat. All the elements are copied diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index be14227f47..d637a93cdc 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -847,7 +847,9 @@ bool Mat::isSubmatrix() const inline size_t Mat::elemSize() const { - return dims > 0 ? step.p[dims - 1] : 0; + size_t res = dims > 0 ? step.p[dims - 1] : 0; + CV_DbgAssert(res != 0); + return res; } inline @@ -3760,7 +3762,9 @@ bool UMat::isSubmatrix() const inline size_t UMat::elemSize() const { - return dims > 0 ? step.p[dims - 1] : 0; + size_t res = dims > 0 ? step.p[dims - 1] : 0; + CV_DbgAssert(res != 0); + return res; } inline diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index cdeda40f7b..82de1f88d8 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -64,13 +64,14 @@ namespace cv ////////////////////////////// Small Matrix /////////////////////////// //! 
@cond IGNORED -struct CV_EXPORTS Matx_AddOp {}; -struct CV_EXPORTS Matx_SubOp {}; -struct CV_EXPORTS Matx_ScaleOp {}; -struct CV_EXPORTS Matx_MulOp {}; -struct CV_EXPORTS Matx_DivOp {}; -struct CV_EXPORTS Matx_MatMulOp {}; -struct CV_EXPORTS Matx_TOp {}; +// FIXIT Remove this (especially CV_EXPORTS modifier) +struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} }; +struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} }; +struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} }; +struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} }; +struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} }; +struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} }; +struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} }; //! @endcond /** @brief Template class for small matrices whose type and size are known at compilation time @@ -116,7 +117,7 @@ public: //! default constructor Matx(); - Matx(_Tp v0); //!< 1x1 matrix + explicit Matx(_Tp v0); //!< 1x1 matrix Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp index 75864ea822..d706d9664e 100644 --- a/modules/core/include/opencv2/core/operations.hpp +++ b/modules/core/include/opencv2/core/operations.hpp @@ -61,29 +61,44 @@ namespace cv namespace internal { -template struct Matx_FastInvOp +template struct Matx_FastInvOp +{ + bool operator()(const Matx<_Tp, m, n>& a, Matx<_Tp, n, m>& b, int method) const + { + return invert(a, b, method) != 0; + } +}; + +template struct Matx_FastInvOp<_Tp, m, m> { bool operator()(const Matx<_Tp, m, m>& a, Matx<_Tp, m, m>& b, int method) const { - Matx<_Tp, m, m> temp = a; + if (method == DECOMP_LU || method == DECOMP_CHOLESKY) + { + Matx<_Tp, m, m> temp = a; - // assume that b is all 0's on input => make it a unity matrix - for( int i = 0; i < m; i++ ) - b(i, i) = (_Tp)1; + // assume that b is all 0's on input => make it a unity matrix + for (int i = 0; i < m; i++) + b(i, i) = (_Tp)1; - if( method == DECOMP_CHOLESKY ) - return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m); + if (method == DECOMP_CHOLESKY) + return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m); - return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0; + return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0; + } + else + { + return invert(a, b, method) != 0; + } } }; -template struct Matx_FastInvOp<_Tp, 2> +template struct Matx_FastInvOp<_Tp, 2, 2> { - bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int) const + bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int /*method*/) const { _Tp d = (_Tp)determinant(a); - if( d == 0 ) + if (d == 0) return false; d = 1/d; b(1,1) = a(0,0)*d; @@ -94,12 +109,12 @@ template struct Matx_FastInvOp<_Tp, 2> } }; -template struct Matx_FastInvOp<_Tp, 3> +template struct Matx_FastInvOp<_Tp, 3, 3> { - bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int) const + bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int /*method*/) const { _Tp d = (_Tp)determinant(a); - if( d == 0 ) + if (d == 0) return false; d = 1/d; b(0,0) = (a(1,1) * a(2,2) - a(1,2) * a(2,1)) * d; @@ -118,27 +133,43 @@ template struct 
Matx_FastInvOp<_Tp, 3> }; -template struct Matx_FastSolveOp +template struct Matx_FastSolveOp +{ + bool operator()(const Matx<_Tp, m, l>& a, const Matx<_Tp, m, n>& b, + Matx<_Tp, l, n>& x, int method) const + { + return cv::solve(a, b, x, method); + } +}; + +template struct Matx_FastSolveOp<_Tp, m, m, n> { bool operator()(const Matx<_Tp, m, m>& a, const Matx<_Tp, m, n>& b, Matx<_Tp, m, n>& x, int method) const { - Matx<_Tp, m, m> temp = a; - x = b; - if( method == DECOMP_CHOLESKY ) - return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n); + if (method == DECOMP_LU || method == DECOMP_CHOLESKY) + { + Matx<_Tp, m, m> temp = a; + x = b; + if( method == DECOMP_CHOLESKY ) + return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n); - return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0; + return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0; + } + else + { + return cv::solve(a, b, x, method); + } } }; -template struct Matx_FastSolveOp<_Tp, 2, 1> +template struct Matx_FastSolveOp<_Tp, 2, 2, 1> { bool operator()(const Matx<_Tp, 2, 2>& a, const Matx<_Tp, 2, 1>& b, Matx<_Tp, 2, 1>& x, int) const { _Tp d = (_Tp)determinant(a); - if( d == 0 ) + if (d == 0) return false; d = 1/d; x(0) = (b(0)*a(1,1) - b(1)*a(0,1))*d; @@ -147,13 +178,13 @@ template struct Matx_FastSolveOp<_Tp, 2, 1> } }; -template struct Matx_FastSolveOp<_Tp, 3, 1> +template struct Matx_FastSolveOp<_Tp, 3, 3, 1> { bool operator()(const Matx<_Tp, 3, 3>& a, const Matx<_Tp, 3, 1>& b, Matx<_Tp, 3, 1>& x, int) const { _Tp d = (_Tp)determinant(a); - if( d == 0 ) + if (d == 0) return false; d = 1/d; x(0) = d*(b(0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1)) - @@ -193,15 +224,8 @@ template inline Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const { Matx<_Tp, n, m> b; - bool ok; - if( m == n && (method == DECOMP_LU || method == DECOMP_CHOLESKY) ) - ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*reinterpret_cast*>(this), reinterpret_cast&>(b), method); - else - { - Mat A(*this, false), B(b, false); - ok = (invert(A, B, method) != 0); - } - if( NULL != p_is_ok ) { *p_is_ok = ok; } + bool ok = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, b, method); + if (p_is_ok) *p_is_ok = ok; return ok ? b : Matx<_Tp, n, m>::zeros(); } @@ -209,15 +233,7 @@ template template inline Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) const { Matx<_Tp, n, l> x; - bool ok; - if( method == DECOMP_LU || method == DECOMP_CHOLESKY ) - ok = cv::internal::Matx_FastSolveOp<_Tp, m, l>()(*this, rhs, x, method); - else - { - Mat A(*this, false), B(rhs, false), X(x, false); - ok = cv::solve(A, B, X, method); - } - + bool ok = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, x, method); return ok ? 
x : Matx<_Tp, n, l>::zeros(); } diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp index 325ef5fb7c..4a7298bc0e 100644 --- a/modules/core/perf/perf_mat.cpp +++ b/modules/core/perf/perf_mat.cpp @@ -61,7 +61,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone, TEST_CYCLE() { - source.clone(); + Mat tmp = source.clone(); + (void)tmp; } destination = source.clone(); @@ -88,7 +89,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi, TEST_CYCLE() { - roi.clone(); + Mat tmp = roi.clone(); + (void)tmp; } destination = roi.clone(); diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 508bebc726..c1653d7add 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1180,7 +1180,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ || op == CMP_NE || op == CMP_GE || op == CMP_GT ); - if(_src1.empty() && _src2.empty()) + if(_src1.empty() || _src2.empty()) { _dst.release(); return; diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp index 98306d35a6..11e9868617 100644 --- a/modules/core/src/array.cpp +++ b/modules/core/src/array.cpp @@ -2916,12 +2916,29 @@ cvInitImageHeader( IplImage * image, CvSize size, int depth, if( !image ) CV_Error( CV_HeaderIsNull, "null pointer to header" ); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif memset( image, 0, sizeof( *image )); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif image->nSize = sizeof( *image ); icvGetColorModel( channels, &colorModel, &channelSeq ); - strncpy( image->colorModel, colorModel, 4 ); - strncpy( image->channelSeq, channelSeq, 4 ); + for (int i = 0; i < 4; i++) + { + image->colorModel[i] = colorModel[i]; + if (colorModel[i] == 0) + break; + } + for (int i = 0; i < 4; i++) + { + image->channelSeq[i] = channelSeq[i]; + if (channelSeq[i] == 0) + break; + } if( size.width < 0 || size.height < 0 ) CV_Error( CV_BadROISize, "Bad input roi" ); diff --git a/modules/core/src/batch_distance.cpp b/modules/core/src/batch_distance.cpp index 1fd088dd5e..4c90db7ec4 100644 --- a/modules/core/src/batch_distance.cpp +++ b/modules/core/src/batch_distance.cpp @@ -263,6 +263,7 @@ void cv::batchDistance( InputArray _src1, InputArray _src2, if( crosscheck ) { CV_Assert( K == 1 && update == 0 && mask.empty() ); + CV_Assert(!nidx.empty()); Mat tdist, tidx; batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false); diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp index 1ecd528c39..92a3b6006e 100644 --- a/modules/core/src/check.cpp +++ b/modules/core/src/check.cpp @@ -44,7 +44,7 @@ static const char* getTestOpMath(unsigned testOp) const char* depthToString_(int depth) { static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" }; - return depth <= CV_USRTYPE1 ? depthNames[depth] : NULL; + return (depth <= CV_USRTYPE1 && depth >= 0) ? 
depthNames[depth] : NULL; } const cv::String typeToString_(int type) diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp index 6c71093e57..7168e8d643 100644 --- a/modules/core/src/convert.fp16.cpp +++ b/modules/core/src/convert.fp16.cpp @@ -81,10 +81,9 @@ void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) { float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - cv_vst1_f16((__fp16*)dst + x, v_dst); + cv_vst1_f16(dst + x, v_dst); } for ( ; x < size.width; x++ ) diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index b8a52f2f5a..8775bff4aa 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -411,6 +411,8 @@ Mat& Mat::operator = (const Scalar& s) { CV_INSTRUMENT_REGION() + if (empty()) return *this; + const Mat* arrays[] = { this }; uchar* dptr; NAryMatIterator it(arrays, &dptr, 1); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 2ec808a7a6..95abe71288 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -1100,6 +1100,9 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth CV_Assert( type == _src2.type() && (type == CV_32F || type == CV_64F) ); method &= ~DECOMP_NORMAL; + CV_Check(method, method == DECOMP_LU || method == DECOMP_SVD || method == DECOMP_EIG || + method == DECOMP_CHOLESKY || method == DECOMP_QR, + "Unsupported method, see #DecompTypes"); CV_Assert( (method != DECOMP_LU && method != DECOMP_CHOLESKY) || is_normal || src.rows == src.cols ); diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index b10bab6d63..354cc00421 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -22,7 +22,6 @@ void log32f(const float *src, float *dst, int n); void log64f(const double *src, double *dst, int n); float fastAtan2(float y, float x); - #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY using namespace std; @@ -36,162 +35,140 @@ static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); using namespace cv; -#if CV_SIMD128 - -template -struct v_atan +static inline float atan_f32(float y, float x) { - typedef V_RegTrait128 Trait; - typedef typename Trait::reg VT; // vector type - enum { WorkWidth = VT::nlanes * 2 }; - - v_atan(const T & scale) - : s(Trait::all(scale)) + float ax = std::abs(x), ay = std::abs(y); + float a, c, c2; + if( ax >= ay ) { - eps = Trait::all(DBL_EPSILON); - z = Trait::zero(); - p7 = Trait::all(atan2_p7); - p5 = Trait::all(atan2_p5); - p3 = Trait::all(atan2_p3); - p1 = Trait::all(atan2_p1); - val90 = Trait::all(90.f); - val180 = Trait::all(180.f); - val360 = Trait::all(360.f); + c = ay/(ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; } + else + { + c = ax/(ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + return a; +} - inline int operator()(int len, const T * Y, const T * X, T * angle) +#if CV_SIMD + +struct v_atan_f32 +{ + explicit v_atan_f32(const float& scale) { - int i = 0; - const int c = VT::nlanes; - for ( ; i <= len - c * 2; i += c * 2) - { - VT x1 = v_load(X + i); - VT x2 = v_load(X + i + c); - VT y1 = v_load(Y + i); - VT y2 = v_load(Y + i + c); - v_store(&angle[i], s * one(x1, y1)); - v_store(&angle[i + c], s * 
one(x2, y2)); - } - return i; + eps = vx_setall_f32((float)DBL_EPSILON); + z = vx_setzero_f32(); + p7 = vx_setall_f32(atan2_p7); + p5 = vx_setall_f32(atan2_p5); + p3 = vx_setall_f32(atan2_p3); + p1 = vx_setall_f32(atan2_p1); + val90 = vx_setall_f32(90.f); + val180 = vx_setall_f32(180.f); + val360 = vx_setall_f32(360.f); + s = vx_setall_f32(scale); } -private: - inline VT one(VT & x, VT & y) + v_float32 compute(const v_float32& y, const v_float32& x) { - VT ax = v_abs(x); - VT ay = v_abs(y); - VT c = v_min(ax, ay) / (v_max(ax, ay) + eps); - VT cc = c * c; - VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c; + v_float32 ax = v_abs(x); + v_float32 ay = v_abs(y); + v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps); + v_float32 cc = c * c; + v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c; a = v_select(ax >= ay, a, val90 - a); a = v_select(x < z, val180 - a, a); a = v_select(y < z, val360 - a, a); - return a; + return a * s; } -private: - VT eps; - VT z; - VT p7; - VT p5; - VT p3; - VT p1; - VT val90; - VT val180; - VT val360; - VT s; + v_float32 eps; + v_float32 z; + v_float32 p7; + v_float32 p5; + v_float32 p3; + v_float32 p1; + v_float32 val90; + v_float32 val180; + v_float32 val360; + v_float32 s; }; -#if !CV_SIMD128_64F +#endif + +} // anonymous:: + +///////////////////////////////////// ATAN2 //////////////////////////////////// -// emulation -template <> -struct v_atan +static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { - v_atan(double scale) : impl(static_cast(scale)) {} - inline int operator()(int len, const double * Y, const double * X, double * angle) + float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); + int i = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + v_atan_f32 v(scale); + + for( ; i < len; i += VECSZ*2 ) { - int i = 0; - const int c = v_atan::WorkWidth; - float bufY[c]; - float bufX[c]; - float bufA[c]; - for ( ; i <= len - c ; i += c) + if( i + VECSZ*2 > len ) { - for (int j = 0; j < c; ++j) - { - bufY[j] = static_cast(Y[i + j]); - bufX[j] = static_cast(X[i + j]); - } - impl(c, bufY, bufX, bufA); - for (int j = 0; j < c; ++j) - { - angle[i + j] = bufA[j]; - } + // if it's inplace operation, we cannot repeatedly process + // the tail for the second time, so we have to use the + // scalar code + if( i == 0 || angle == X || angle == Y ) + break; + i = len - VECSZ*2; } - return i; - } -private: - v_atan impl; -}; -#endif - -#endif -template -static inline T atanImpl(T y, T x) -{ - T ax = std::abs(x), ay = std::abs(y); - T a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + static_cast(DBL_EPSILON)); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + static_cast(DBL_EPSILON)); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - return a; -} + v_float32 y0 = vx_load(Y + i); + v_float32 x0 = vx_load(X + i); + v_float32 y1 = vx_load(Y + i + VECSZ); + v_float32 x1 = vx_load(X + i + VECSZ); -template -static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees) -{ - int i = 0; - T scale = angleInDegrees ? 
1 : static_cast(CV_PI/180); + v_float32 r0 = v.compute(y0, x0); + v_float32 r1 = v.compute(y1, x1); -#if CV_SIMD128 - i = v_atan(scale)(len, Y, X, angle); + v_store(angle + i, r0); + v_store(angle + i + VECSZ, r1); + } + vx_cleanup(); #endif for( ; i < len; i++ ) - { - angle[i] = atanImpl(Y[i], X[i]) * scale; - } + angle[i] = atan_f32(Y[i], X[i])*scale; } -} // anonymous:: - -///////////////////////////////////// ATAN2 //////////////////////////////////// - void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + fastAtan32f_(Y, X, angle, len, angleInDegrees ); } void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + + const int BLKSZ = 128; + float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ]; + for( int i = 0; i < len; i += BLKSZ ) + { + int j, blksz = std::min(BLKSZ, len - i); + for( j = 0; j < blksz; j++ ) + { + ybuf[j] = (float)Y[i + j]; + xbuf[j] = (float)X[i + j]; + } + fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees); + for( j = 0; j < blksz; j++ ) + angle[i + j] = abuf[j]; + } } // deprecated @@ -207,16 +184,24 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); - v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 4, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -232,16 +217,24 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); - v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 2, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -258,14 +251,22 @@ void invSqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_invsqrt(t0); t1 = v_invsqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -276,13 +277,23 @@ void invSqrt32f(const float* src, float* dst, int len) void 
invSqrt64f(const double* src, double* dst, int len) { CV_INSTRUMENT_REGION() - int i = 0; -#if CV_SSE2 - __m128d v_1 = _mm_set1_pd(1.0); - for ( ; i <= len - 2; i += 2) - _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for ( ; i < len; i += VECSZ*2) + { + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); + t0 = v_invsqrt(t0); + t1 = v_invsqrt(t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); + } #endif for( ; i < len; i++ ) @@ -296,14 +307,22 @@ void sqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -317,14 +336,22 @@ void sqrt64f(const double* src, double* dst, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 2, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -377,21 +404,6 @@ void log64f(const double *src, double *dst, int n) ////////////////////////////////////// EXP ///////////////////////////////////// -typedef union -{ - struct { -#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) - int hi; - int lo; -#else - int lo; - int hi; -#endif - } i; - double d; -} -DBLINT; - #define EXPTAB_SCALE 6 #define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) @@ -464,6 +476,8 @@ static const double expTab[] = { 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, }; +static float expTab_f[EXPTAB_MASK+1]; +static volatile bool extTab_f_initialized = false; // the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 #if (defined _MSC_VER && _MSC_VER < 1500) || \ @@ -480,283 +494,117 @@ void exp32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() + if( !extTab_f_initialized ) + { + for( int j = 0; j <= EXPTAB_MASK; j++ ) + expTab_f[j] = (float)expTab[j]; + extTab_f_initialized = true; + } + static const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); -#undef EXPPOLY -#define EXPPOLY(x) \ -(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) - int i = 0; const Cv32suf* x = (const Cv32suf*)_x; - Cv32suf buf[4]; - -#if CV_AVX2 - if( n >= 8 ) + float minval = (float)(-exp_max_val/exp_prescale); + float maxval = (float)(exp_max_val/exp_prescale); + float postscale = (float)exp_postscale; + 
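// In outline, the code below evaluates exp(x) as 2^(x/ln2): multiplying by exp_prescale
// expresses x/ln2 in units of 1/2^EXPTAB_SCALE; the rounded value xi is then split into
// an integer power of two (rebuilt directly in the float exponent bits via the "+127,
// shift by 23" step), a table factor expTab_f[xi & EXPTAB_MASK] ~ 2^(k/2^EXPTAB_SCALE)
// (up to a constant folded between the table and the coefficients), and a small remainder
// handled by the short polynomial in A1..A4.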
+#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const v_float32 vprescale = vx_setall_f32((float)exp_prescale); + static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); + static const v_float32 vminval = vx_setall_f32(minval); + static const v_float32 vmaxval = vx_setall_f32(maxval); + + static const v_float32 vA1 = vx_setall_f32((float)A1); + static const v_float32 vA2 = vx_setall_f32((float)A2); + static const v_float32 vA3 = vx_setall_f32((float)A3); + static const v_float32 vA4 = vx_setall_f32((float)A4); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - static const __m256d prescale4 = _mm256_set1_pd(exp_prescale); - static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m256 mA1 = _mm256_set1_ps(A1); - static const __m256 mA2 = _mm256_set1_ps(A2); - static const __m256 mA3 = _mm256_set1_ps(A3); - static const __m256 mA4 = _mm256_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 32 == 0; - - ushort CV_DECL_ALIGNED(32) tab_idx[16]; - - for( ; i <= n - 8; i += 8 ) + if( i + VECSZ*2 > n ) { - __m128i xi0, xi1; - - __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4)); - __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4)); - - xd0 = _mm256_mul_pd(xd0, prescale4); - xd1 = _mm256_mul_pd(xd1, prescale4); + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - xi0 = _mm256_cvtpd_epi32(xd0); - xi1 = _mm256_cvtpd_epi32(xd1); + v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0)); - xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1)); + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - // gcc does not support _mm256_set_m128 - //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0)); - __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1); + xf0 *= vprescale; + xf1 *= vprescale; - xf = _mm256_mul_ps(xf, postscale8); + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; - xi0 = _mm_packs_epi32(xi0, xi1); + v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); + v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); + v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); + xi0 = v_min(v_max(v_shr(xi0) + v127, v0), v255); + xi1 = v_min(v_max(v_shr(xi1) + v127, v0), v255); - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0)); + yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); - __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]); - __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]); + v_float32 zf0 = xf0 + vA1; + v_float32 zf1 = xf1 + 
vA1; - // gcc does not support _mm256_set_m128 - //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0)); - __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1); + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - //_mm256_set_m128i(xi1, xi0) - __m256i temp = _mm256_castps_si256(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(xi0)), _mm_castsi128_ps(xi1), 1)); + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23))); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - __m256 zf = _mm256_add_ps(xf, mA1); + zf0 *= yf0; + zf1 *= yf1; -#if CV_FMA3 - zf = _mm256_fmadd_ps(zf, xf, mA2); - zf = _mm256_fmadd_ps(zf, xf, mA3); - zf = _mm256_fmadd_ps(zf, xf, mA4); -#else - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4); -#endif - zf = _mm256_mul_ps(zf, yf); - - if( y_aligned ) - { - _mm256_store_ps(y + i, zf); - } - else - { - _mm256_storeu_ps(y + i, zf); - } + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); } - } -#elif CV_SSE2 - if( n >= 8 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - static const __m128 mA3 = _mm_set1_ps(A3); - static const __m128 mA4 = _mm_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 16 == 0; - - ushort CV_DECL_ALIGNED(16) tab_idx[8]; - - for( ; i <= n - 8; i += 8 ) + else { - __m128 xf0, xf1; - xf0 = _mm_loadu_ps(&x[i].f); - xf1 = _mm_loadu_ps(&x[i+4].f); - __m128i xi0, xi1, xi2, xi3; - - xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); - xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); - - __m128d xd0 = _mm_cvtps_pd(xf0); - __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); - __m128d xd1 = _mm_cvtps_pd(xf1); - __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); - - xd0 = _mm_mul_pd(xd0, prescale2); - xd2 = _mm_mul_pd(xd2, prescale2); - xd1 = _mm_mul_pd(xd1, prescale2); - xd3 = _mm_mul_pd(xd3, prescale2); - - xi0 = _mm_cvtpd_epi32(xd0); - xi2 = _mm_cvtpd_epi32(xd2); - - xi1 = _mm_cvtpd_epi32(xd1); - xi3 = _mm_cvtpd_epi32(xd3); - - xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); - xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); - xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); - xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); - - xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); - xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); - - xf0 = _mm_mul_ps(xf0, postscale4); - xf1 = _mm_mul_ps(xf1, postscale4); - - xi0 = _mm_unpacklo_epi64(xi0, xi2); - xi1 = _mm_unpacklo_epi64(xi1, xi3); - xi0 = _mm_packs_epi32(xi0, xi1); - - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); - - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - - __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yd1 = 
_mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); - __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); - - yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); - yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); - - __m128 zf0 = _mm_add_ps(xf0, mA1); - __m128 zf1 = _mm_add_ps(xf1, mA1); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); - - zf0 = _mm_mul_ps(zf0, yf0); - zf1 = _mm_mul_ps(zf1, yf1); - - if( y_aligned ) - { - _mm_store_ps(y + i, zf0); - _mm_store_ps(y + i + 4, zf1); - } - else - { - _mm_storeu_ps(y + i, zf0); - _mm_storeu_ps(y + i + 4, zf1); - } + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); } } - else + vx_cleanup(); #endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - int val0, val1, val2, val3, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) - x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) - x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) - x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[0].i = t << 23; - - t = (val1 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[1].i = t << 23; - - t = (val2 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[2].i = t << 23; - - t = (val3 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[3].i = t << 23; - - x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = (float)x0; - y[i + 1] = (float)x1; - - x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = (float)x2; - y[i + 3] = (float)x3; - } for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + float x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= (float)exp_prescale; + Cv32suf buf; - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 127; + int t = (xi >> EXPTAB_SCALE) + 127; t = !(t & ~255) ? t : t < 0 ? 
0 : 255; + buf.i = t << 23; - buf[0].i = t << 23; - x0 = (x0 - val0)*exp_postscale; - - y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); + y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4); } } @@ -772,162 +620,111 @@ void exp64f( const double *_x, double *y, int n ) A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; -#undef EXPPOLY -#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) - int i = 0; - Cv64suf buf[4]; const Cv64suf* x = (const Cv64suf*)_x; - -#if CV_SSE2 - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128d postscale2 = _mm_set1_pd(exp_postscale); - static const __m128d maxval2 = _mm_set1_pd(exp_max_val); - static const __m128d minval2 = _mm_set1_pd(-exp_max_val); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - - int CV_DECL_ALIGNED(16) tab_idx[4]; - - for( ; i <= n - 4; i += 4 ) + double minval = (-exp_max_val/exp_prescale); + double maxval = (exp_max_val/exp_prescale); + +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vprescale = vx_setall_f64(exp_prescale); + static const v_float64 vpostscale = vx_setall_f64(exp_postscale); + static const v_float64 vminval = vx_setall_f64(minval); + static const v_float64 vmaxval = vx_setall_f64(maxval); + + static const v_float64 vA1 = vx_setall_f64(A1); + static const v_float64 vA2 = vx_setall_f64(A2); + static const v_float64 vA3 = vx_setall_f64(A3); + static const v_float64 vA4 = vx_setall_f64(A4); + static const v_float64 vA5 = vx_setall_f64(A5); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); - __m128i xi0, xi1; - xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); - xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); - xf0 = _mm_mul_pd(xf0, prescale2); - xf1 = _mm_mul_pd(xf1, prescale2); - - xi0 = _mm_cvtpd_epi32(xf0); - xi1 = _mm_cvtpd_epi32(xf1); - xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); - xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); - - xi0 = _mm_unpacklo_epi64(xi0, xi1); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); - - xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); - xi0 = _mm_packs_epi32(xi0, xi0); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); - - __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); - yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); - - __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); - __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); - - zf0 = 
_mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); - - zf0 = _mm_mul_pd(zf0, yf0); - zf1 = _mm_mul_pd(zf1, yf1); - - _mm_storeu_pd(y + i, zf0); - _mm_storeu_pd(y + i + 2, zf1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; + if( i + VECSZ*2 > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - double y0, y1, y2, y3; - int val0, val1, val2, val3, t; + v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - t = (int)(x[i+1].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x1 = t < 0 ? -exp_max_val : exp_max_val; + xf0 *= vprescale; + xf1 *= vprescale; - t = (int)(x[i+2].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x2 = t < 0 ? -exp_max_val : exp_max_val; + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; - t = (int)(x[i+3].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x3 = t < 0 ? -exp_max_val : exp_max_val; + v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); + v_float64 yf1 = v_lut(expTab, xi1 & vidxmask); - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); + v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); + xi0 = v_min(v_max(v_shr(xi0) + v1023, v0), v2047); + xi1 = v_min(v_max(v_shr(xi1) + v1023, v0), v2047); - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; + v_int64 xq0, xq1, dummy; + v_expand(xi0, xq0, dummy); + v_expand(xi1, xq1, dummy); - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[0].i = (int64)t << 52; + yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); + yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); - t = (val1 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[1].i = (int64)t << 52; + v_float64 zf0 = xf0 + vA1; + v_float64 zf1 = xf1 + vA1; - t = (val2 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[2].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - t = (val3 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 
0 : 2047; - buf[3].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - y[i] = y0; - y[i + 1] = y1; + zf0 = v_fma(zf0, xf0, vA5); + zf1 = v_fma(zf1, xf1, vA5); - y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + zf0 *= yf0; + zf1 *= yf1; - y[i + 2] = y2; - y[i + 3] = y3; + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); + } + else + { + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); + } } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + double x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= exp_prescale; + Cv64suf buf; - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*exp_postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 1023; + int t = (xi >> EXPTAB_SCALE) + 1023; t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf.i = (int64)t << 52; - buf[0].i = (int64)t << 52; - x0 = (x0 - val0)*exp_postscale; - - y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5); } } @@ -937,12 +734,10 @@ void exp64f( const double *_x, double *y, int n ) /////////////////////////////////////////// LOG /////////////////////////////////////// -#define LOGTAB_SCALE 8 +#define LOGTAB_SCALE 8 #define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) -#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) -#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) -static const double CV_DECL_ALIGNED(16) icvLogTab[] = { +static const double CV_DECL_ALIGNED(16) logTab[] = { 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, @@ -1201,154 +996,85 @@ static const double CV_DECL_ALIGNED(16) icvLogTab[] = { .69314718055994530941723212145818, 5.0e-01, }; +static float logTab_f[(LOGTAB_MASK+1)*2]; +static volatile bool logTab_f_initialized = false; - -#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) +#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1]) static const double ln_2 = 0.69314718055994530941723212145818; void log32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() - static const float shift[] = { 0, -1.f/512 }; + if( !logTab_f_initialized ) + { + for( int j = 0; j < (LOGTAB_MASK+1)*2; j++ ) + logTab_f[j] = (float)logTab[j]; + logTab_f_initialized = true; + } + + static const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1; static const float A0 = 0.3333333333333333333333333f, A1 = -0.5f, A2 = 1.f; -#undef LOGPOLY -#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) - int i = 0; - Cv32suf buf[4]; const int* x = (const int*)_x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128 _1_4 = _mm_set1_ps(1.f); - static const __m128 shift4 = _mm_set1_ps(-1.f/512); - - static const __m128 mA0 = _mm_set1_ps(A0); - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const 
v_float32 vln2 = vx_setall_f32((float)ln_2); + static const v_float32 v1 = vx_setall_f32(1.f); + static const v_float32 vshift = vx_setall_f32(-1.f/512); - int CV_DECL_ALIGNED(16) idx[4]; + static const v_float32 vA0 = vx_setall_f32(A0); + static const v_float32 vA1 = vx_setall_f32(A1); + static const v_float32 vA2 = vx_setall_f32(A2); - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); - - __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - - __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); - xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); - xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); - - __m128 zf0 = _mm_mul_ps(xf0, mA0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); - yf0 = _mm_add_ps(yf0, zf0); - - _mm_storeu_ps(y + i, yf0); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = x[i]; - h1 = x[i+1]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; - - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = x[i+2]; - h3 = x[i+3]; - - x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); - - buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); - buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); - - y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; - y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; + if( i + VECSZ > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ; + } - h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + v_int32 h0 = vx_load(x + i); + v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127); + v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23); - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2); + v_float32 yf0, xf0; - x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); + v_lut_deinterleave(logTab_f, h0, yf0, xf0); - x0 += shift[h0 == 510]; - x1 += shift[h1 == 510]; - y0 += LOGPOLY( x0 ); - y1 += LOGPOLY( x1 ); + yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0); - y[i] = (float) y0; - y[i + 1] = (float) y1; + v_float32 delta = v_reinterpret_as_f32(h0 == 
vx_setall_s32(510)) & vshift; + xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta); - x2 += shift[h2 == 510]; - x3 += shift[h3 == 510]; - y2 += LOGPOLY( x2 ); - y3 += LOGPOLY( x3 ); + v_float32 zf0 = v_fma(xf0, vA0, vA1); + zf0 = v_fma(zf0, xf0, vA2); + zf0 = v_fma(zf0, xf0, yf0); - y[i + 2] = (float) y2; - y[i + 3] = (float) y3; + v_store(y + i, zf0); } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - int h0 = x[i]; - double y0; - float x0; - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + Cv32suf buf; + int i0 = x[i]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23); + int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); - y0 += icvLogTab[h0]; - x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); - x0 += shift[h0 == 510]; - y0 += LOGPOLY( x0 ); - - y[i] = (float)y0; + float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx]; + float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f); + y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0; } } @@ -1356,7 +1082,7 @@ void log64f( const double *x, double *y, int n ) { CV_INSTRUMENT_REGION() - static const double shift[] = { 0, -1./512 }; + static const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1; static const double A7 = 1.0, A6 = -0.5, @@ -1367,175 +1093,69 @@ void log64f( const double *x, double *y, int n ) A1 = 0.1428571428571428769682682968777953647077083587646484375, A0 = -0.125; -#undef LOGPOLY -#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ -(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ -(((A1*xq + A3)*xq + A5)*xq + A7)*(x)) - int i = 0; - DBLINT buf[4]; - DBLINT *X = (DBLINT *) x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128d _1_2 = _mm_set1_pd(1.); - static const __m128d shift2 = _mm_set1_pd(-1./512); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vln2 = vx_setall_f64(ln_2); - static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); - static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); + static const v_float64 + vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), + vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), + vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), + vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7); - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - static const __m128d mA6 = _mm_set1_pd(A6); - static const __m128d mA7 = _mm_set1_pd(A7); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); - - __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); - __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); - - h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); - - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), - _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); - - h0 = 
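The scalar log32f fallback above splits the argument into an exponent, a table node taken from the top mantissa bits, and a small residual handled by a short polynomial. A standalone sketch of the same decomposition, using frexp instead of direct bit manipulation and an illustrative node count:

    #include <cmath>

    static const int N = 256;   // illustrative node count (the table here uses LOGTAB_SCALE = 8)

    // log(x) ~= e*ln2 + log(t_j) + poly(u), where x = m * 2^e with m in [1, 2),
    // t_j is the table node nearest to m, and u = m/t_j - 1 is small (|u| <~ 1/512).
    double log_sketch(double x)                 // x > 0 assumed; special values omitted
    {
        static double logt[N + 1], invt[N + 1];
        static bool init = false;
        if (!init) {
            for (int j = 0; j <= N; j++) {
                double t = 1.0 + double(j) / N;
                logt[j] = std::log(t);
                invt[j] = 1.0 / t;
            }
            init = true;
        }
        int e;
        double m = std::frexp(x, &e) * 2;       // m in [1, 2), x = m * 2^(e - 1)
        int j = (int)((m - 1.0) * N + 0.5);     // nearest table node
        double u = m * invt[j] - 1.0;
        double p = u - 0.5 * u * u + u * u * u / 3;  // first terms of log(1 + u)
        return (e - 1) * std::log(2.0) + logt[j] + p;
    }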
_mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); - xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); - - xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); - xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); - - __m128d zd0 = _mm_mul_pd(xd0, mA0); - __m128d zd1 = _mm_mul_pd(xd1, mA0); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); - - yd0 = _mm_add_pd(yd0, zd0); - yd1 = _mm_add_pd(yd1, zd1); - - _mm_storeu_pd(y + i, yd0); - _mm_storeu_pd(y + i + 2, yd1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double xq; - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = X[i].i.lo; - h1 = X[i + 1].i.lo; - buf[0].i.lo = h0; - buf[1].i.lo = h1; - - h0 = X[i].i.hi; - h1 = X[i + 1].i.hi; - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); - - y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = X[i + 2].i.lo; - h3 = X[i + 3].i.lo; - buf[2].i.lo = h2; - buf[3].i.lo = h3; - - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = X[i + 2].i.hi; - h3 = X[i + 3].i.hi; - - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); - - buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); - buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); - - y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; - y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + if( i + VECSZ > n ) + { + if( i == 0 || x == y ) + break; + i = n - VECSZ; + } - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + v_int64 h0 = vx_load((const int64*)x + i); + v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64()); + yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023); - x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); + v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52); + h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0); + v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2); - y0 += LOGPOLY( x0, h0 == 510 ); - y1 += LOGPOLY( x1, h1 == 510 ); + v_float64 xf0, yf0; + v_lut_deinterleave(logTab, idx, yf0, xf0); - 
y[i] = y0; - y[i + 1] = y1; + yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0); + v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512); + xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta); - y2 += LOGPOLY( x2, h2 == 510 ); - y3 += LOGPOLY( x3, h3 == 510 ); + v_float64 xq = xf0*xf0; + v_float64 zf0 = v_fma(xq, vA0, vA2); + v_float64 zf1 = v_fma(xq, vA1, vA3); + zf0 = v_fma(zf0, xq, vA4); + zf1 = v_fma(zf1, xq, vA5); + zf0 = v_fma(zf0, xq, vA6); + zf1 = v_fma(zf1, xq, vA7); + zf1 = v_fma(zf1, xf0, yf0); + zf0 = v_fma(zf0, xq, zf1); - y[i + 2] = y2; - y[i + 3] = y3; + v_store(y + i, zf0); } +#endif for( ; i < n; i++ ) { - int h0 = X[i].i.hi; - double xq; - double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[0].i.lo = X[i].i.lo; - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - y0 += LOGPOLY( x0, h0 == 510 ); - y[i] = y0; + Cv64suf buf; + int64 i0 = ((const int64*)x)[i]; + + buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52); + int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); + + double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx]; + double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.); + + double xq = x0*x0; + y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0; } } @@ -1543,7 +1163,7 @@ void log64f( const double *x, double *y, int n ) float fastAtan2( float y, float x ) { - return atanImpl(y, x); + return atan_f32(y, x); } #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY diff --git a/modules/core/src/pca.cpp b/modules/core/src/pca.cpp index 5fdce55cea..0625419a70 100644 --- a/modules/core/src/pca.cpp +++ b/modules/core/src/pca.cpp @@ -360,6 +360,19 @@ void cv::PCACompute(InputArray data, InputOutputArray mean, pca.eigenvectors.copyTo(eigenvectors); } +void cv::PCACompute(InputArray data, InputOutputArray mean, + OutputArray eigenvectors, OutputArray eigenvalues, + int maxComponents) +{ + CV_INSTRUMENT_REGION() + + PCA pca; + pca(data, mean, 0, maxComponents); + pca.mean.copyTo(mean); + pca.eigenvectors.copyTo(eigenvectors); + pca.eigenvalues.copyTo(eigenvalues); +} + void cv::PCACompute(InputArray data, InputOutputArray mean, OutputArray eigenvectors, double retainedVariance) { @@ -371,6 +384,19 @@ void cv::PCACompute(InputArray data, InputOutputArray mean, pca.eigenvectors.copyTo(eigenvectors); } +void cv::PCACompute(InputArray data, InputOutputArray mean, + OutputArray eigenvectors, OutputArray eigenvalues, + double retainedVariance) +{ + CV_INSTRUMENT_REGION() + + PCA pca; + pca(data, mean, 0, retainedVariance); + pca.mean.copyTo(mean); + pca.eigenvectors.copyTo(eigenvectors); + pca.eigenvalues.copyTo(eigenvalues); +} + void cv::PCAProject(InputArray data, InputArray mean, InputArray eigenvectors, OutputArray result) { diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index 7b7f8ed9b6..a456c72633 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -511,6 +511,8 @@ static RandnScaleFunc randnScaleTab[] = void RNG::fill( InputOutputArray _mat, int disttype, InputArray _param1arg, InputArray _param2arg, bool saturateRange ) { + if (_mat.empty()) + return; Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat(); int depth = mat.depth(), cn = mat.channels(); AutoBuffer _parambuf; diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 
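The PCACompute overloads added above also return the eigenvalues alongside the mean and eigenvectors; a minimal usage sketch (random data, component count picked arbitrarily, and it assumes this patch is applied):

    #include <opencv2/core.hpp>

    int main()
    {
        cv::Mat data(100, 16, CV_32F);          // 100 samples as rows, 16 features
        cv::randu(data, cv::Scalar(0), cv::Scalar(1));

        cv::Mat mean, eigenvectors, eigenvalues;
        cv::PCACompute(data, mean, eigenvectors, eigenvalues, /*maxComponents=*/4);
        // eigenvectors: 4x16, one principal axis per row
        // eigenvalues : 4x1, variance captured by each axis
        CV_Assert(eigenvalues.rows == 4);
        return 0;
    }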
4171babc03..9a1130fe96 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -241,9 +241,9 @@ TEST(hal_intrin, float64x2) { } #endif -TEST(hal_intrin,float16x4) +TEST(hal_intrin,float16) { - CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ()); + CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); throw SkipTestException("Unsupported hardware: FP16 is not available"); } diff --git a/modules/core/test/test_intrin.fp16.cpp b/modules/core/test/test_intrin.fp16.cpp index 7855fda287..893c5f147a 100644 --- a/modules/core/test/test_intrin.fp16.cpp +++ b/modules/core/test/test_intrin.fp16.cpp @@ -7,9 +7,9 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4() +void test_hal_intrin_float16() { - TheTest() + TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() ; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 7579d9cf05..2f8c1cf0b7 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -6,7 +6,7 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4(); +void test_hal_intrin_float16(); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY @@ -50,6 +50,8 @@ template <> struct initializer<2> template struct Data { typedef typename R::lane_type LaneType; + typedef typename V_TypeTraits::int_type int_type; + Data() { for (int i = 0; i < R::nlanes; ++i) @@ -104,6 +106,17 @@ template struct Data CV_Assert(i >= 0 && i < R::nlanes); return d[i]; } + int_type as_int(int i) const + { + CV_Assert(i >= 0 && i < R::nlanes); + union + { + LaneType l; + int_type i; + } v; + v.l = d[i]; + return v.i; + } const LaneType * mid() const { return d + R::nlanes / 2; @@ -247,8 +260,9 @@ template struct TheTest EXPECT_EQ(d, res); // zero, all - Data resZ = V_RegTrait128::zero(); - Data resV = V_RegTrait128::all(8); + Data resZ, resV; + resZ.fill((LaneType)0); + resV.fill((LaneType)8); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ((LaneType)0, resZ[i]); @@ -339,7 +353,7 @@ template struct TheTest // v_expand and v_load_expand TheTest & test_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA; R a = dataA; @@ -362,7 +376,7 @@ template struct TheTest TheTest & test_expand_q() { - typedef typename V_RegTrait128::q_reg Rx4; + typedef typename V_RegTraits::q_reg Rx4; Data data; Data out = v_load_expand_q(data.d); const int n = Rx4::nlanes; @@ -436,7 +450,7 @@ template struct TheTest TheTest & test_mul_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA, dataB(2); R a = dataA, b = dataB; Rx2 c, d; @@ -456,7 +470,7 @@ template struct TheTest TheTest & test_abs() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA, dataB(10); R a = dataA, b = dataB; @@ -520,7 +534,7 @@ template struct TheTest TheTest & test_dot_prod() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB(2); @@ -608,7 +622,7 @@ template struct TheTest TheTest & test_absdiff() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); @@ -657,12 +671,21 @@ template struct TheTest TheTest & test_mask() 
{ - typedef V_TypeTraits Traits; - typedef typename Traits::int_type int_type; + typedef typename V_RegTraits::int_reg int_reg; + typedef typename V_RegTraits::u_reg uint_reg; + typedef typename int_reg::lane_type int_type; + typedef typename uint_reg::lane_type uint_type; Data dataA, dataB(0), dataC, dataD(1), dataE(2); dataA[1] *= (LaneType)-1; - const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0)); + union + { + LaneType l; + uint_type ui; + } + all1s; + all1s.ui = (uint_type)-1; + LaneType mask_one = all1s.l; dataB[1] = mask_one; dataB[R::nlanes / 2] = mask_one; dataB[R::nlanes - 1] = mask_one; @@ -684,10 +707,8 @@ template struct TheTest Data resF = f; for (int i = 0; i < R::nlanes; ++i) { - int_type m2 = Traits::reinterpret_int(dataB[i]); - EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2) - | (Traits::reinterpret_int(dataE[i]) & ~m2), - Traits::reinterpret_int(resF[i])); + int_type m2 = dataB.as_int(i); + EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i)); } return *this; @@ -697,7 +718,7 @@ template struct TheTest TheTest & test_pack() { SCOPED_TRACE(s); - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? -10 : 10; @@ -734,8 +755,9 @@ template struct TheTest TheTest & test_pack_u() { SCOPED_TRACE(s); - typedef typename V_TypeTraits::w_type LaneType_w; - typedef typename V_RegTrait128::int_reg Ri2; + //typedef typename V_RegTraits::w_type LaneType_w; + typedef typename V_RegTraits::w_reg R2; + typedef typename V_RegTraits::int_reg Ri2; typedef typename Ri2::lane_type w_type; Data dataA, dataB; @@ -864,7 +886,7 @@ template struct TheTest TheTest & test_float_math() { - typedef typename V_RegTrait128::int_reg Ri; + typedef typename V_RegTraits::round_reg Ri; Data data1, data2, data3; data1 *= 1.1; data2 += 10; @@ -1005,31 +1027,28 @@ template struct TheTest TheTest & test_loadstore_fp16() { -#if CV_FP16 && CV_SIMD128 +#if CV_FP16 && CV_SIMD AlignedData data; AlignedData out; - if(1 /* checkHardwareSupport(CV_CPU_FP16) */ ) - { - // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); - - // check some initialization methods - R r1 = data.u; - R r2 = v_load_f16(data.a.d); - R r3(r2); - EXPECT_EQ(data.u[0], r1.get0()); - EXPECT_EQ(data.a[0], r2.get0()); - EXPECT_EQ(data.a[0], r3.get0()); - - // check some store methods - out.a.clear(); - v_store_f16(out.a.d, r1); - EXPECT_EQ(data.a, out.a); - } + // check if addresses are aligned and unaligned respectively + EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + + // check some initialization methods + R r1 = data.u; + R r2 = v_load_f16(data.a.d); + R r3(r2); + EXPECT_EQ(data.u[0], r1.get0()); + EXPECT_EQ(data.a[0], r2.get0()); + EXPECT_EQ(data.a[0], r3.get0()); + + // check some store methods + out.a.clear(); + v_store(out.a.d, r1); + EXPECT_EQ(data.a, out.a); return *this; #endif @@ -1037,18 +1056,15 @@ template struct TheTest TheTest & test_float_cvt_fp16() { -#if CV_FP16 && CV_SIMD128 - AlignedData data; - - if(1 /* checkHardwareSupport(CV_CPU_FP16) */) - { - // check conversion - v_float32x4 r1 = 
v_load(data.a.d); - v_float16x4 r2 = v_cvt_f16(r1); - v_float32x4 r3 = v_cvt_f32(r2); - EXPECT_EQ(0x3c00, r2.get0()); - EXPECT_EQ(r3.get0(), r1.get0()); - } +#if CV_FP16 && CV_SIMD + AlignedData data; + + // check conversion + v_float32 r1 = vx_load(data.a.d); + v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32()); + v_float32 r3 = v_cvt_f32(r2); + EXPECT_EQ(0x3c00, r2.get0()); + EXPECT_EQ(r3.get0(), r1.get0()); return *this; #endif diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 896df47a15..68dfc3c969 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -134,7 +134,9 @@ double Core_PowTest::get_success_error_level( int test_case_idx, int i, int j ) if( depth < CV_32F ) return power == cvRound(power) && power >= 0 ? 0 : 1; else - return Base::get_success_error_level( test_case_idx, i, j ); + { + return depth != CV_64F ? Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1; + } } @@ -3129,6 +3131,75 @@ TEST(Core_QR_Solver, accuracy64f) ASSERT_FALSE(solve(A, B, solutionQR, DECOMP_QR)); } +TEST(Core_Solve, regression_11888) +{ + cv::Matx A( + 2, 1, + 3, 1, + 6, 1 + ); + cv::Vec b(4, 5, 7); + cv::Matx xQR = A.solve(b, DECOMP_QR); + cv::Matx xSVD = A.solve(b, DECOMP_SVD); + EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 0.001); + cv::Matx iA = A.inv(DECOMP_SVD); + EXPECT_LE(cvtest::norm(iA*A, Matx::eye(), NORM_L2), 1e-3); + EXPECT_ANY_THROW({ + /*cv::Matx xLU =*/ A.solve(b, DECOMP_LU); + std::cout << "FATAL ERROR" << std::endl; + }); +} + +TEST(Core_Solve, Matx_2_2) +{ + cv::Matx A( + 2, 1, + 1, 1 + ); + cv::Vec b(4, 5); + cv::Matx xLU = A.solve(b, DECOMP_LU); + cv::Matx xQR = A.solve(b, DECOMP_QR); + cv::Matx xSVD = A.solve(b, DECOMP_SVD); + EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3); + EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3); + cv::Matx iA = A.inv(DECOMP_SVD); + EXPECT_LE(cvtest::norm(iA*A, Matx::eye(), NORM_L2), 1e-3); +} +TEST(Core_Solve, Matx_3_3) +{ + cv::Matx A( + 2, 1, 0, + 0, 1, 1, + 1, 0, 1 + ); + cv::Vec b(4, 5, 6); + cv::Matx xLU = A.solve(b, DECOMP_LU); + cv::Matx xQR = A.solve(b, DECOMP_QR); + cv::Matx xSVD = A.solve(b, DECOMP_SVD); + EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3); + EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3); + cv::Matx iA = A.inv(DECOMP_SVD); + EXPECT_LE(cvtest::norm(iA*A, Matx::eye(), NORM_L2), 1e-3); +} + +TEST(Core_Solve, Matx_4_4) +{ + cv::Matx A( + 2, 1, 0, 4, + 0, 1, 1, 3, + 1, 0, 1, 2, + 2, 2, 0, 1 + ); + cv::Vec b(4, 5, 6, 7); + cv::Matx xLU = A.solve(b, DECOMP_LU); + cv::Matx xQR = A.solve(b, DECOMP_QR); + cv::Matx xSVD = A.solve(b, DECOMP_SVD); + EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3); + EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3); + cv::Matx iA = A.inv(DECOMP_SVD); + EXPECT_LE(cvtest::norm(iA*A, Matx::eye(), NORM_L2), 1e-3); +} + softdouble naiveExp(softdouble x) { int exponent = x.getExp(); diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index bb1aab3328..1bdf516a16 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp @@ -794,13 +794,13 @@ bool CV_OperationsTest::TestTemplateMat() Size size(2, 5); TestType(size, 1.f); - cv::Vec3f val1 = 1.f; + cv::Vec3f val1(1.f); TestType(size, val1); - cv::Matx31f val2 = 1.f; + cv::Matx31f val2(1.f); TestType(size, val2); - cv::Matx41f val3 = 1.f; + cv::Matx41f val3(1.f); TestType(size, val3); - 
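The Core_Solve tests above exercise the small fixed-size path of cv::Matx::solve; a minimal self-contained version of the 2x2 case with the types written out explicitly (values match the Matx_2_2 test):

    #include <opencv2/core.hpp>
    #include <cmath>

    int main()
    {
        cv::Matx<float, 2, 2> A(2, 1,
                                1, 1);
        cv::Vec<float, 2> b(4, 5);
        // Solve A*x = b; DECOMP_QR / DECOMP_SVD also work and additionally
        // cover the over-determined 3x2 system from regression_11888.
        cv::Vec<float, 2> x = A.solve(b, cv::DECOMP_LU);
        CV_Assert(std::abs(x[0] + 1.f) < 1e-5f && std::abs(x[1] - 6.f) < 1e-5f);
        return 0;
    }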
cv::Matx32f val4 = 1.f; + cv::Matx32f val4(1.f); TestType(size, val4); } catch (const test_excep& e) diff --git a/modules/core/test/test_rand.cpp b/modules/core/test/test_rand.cpp index 82bb6104a9..8677aa0c31 100644 --- a/modules/core/test/test_rand.cpp +++ b/modules/core/test/test_rand.cpp @@ -168,11 +168,12 @@ void Core_RandTest::run( int ) { tested_rng = saved_rng; int sz = 0, dsz = 0, slice; - for( slice = 0; slice < maxSlice; slice++, sz += dsz ) + for( slice = 0; slice < maxSlice && sz < SZ; slice++, sz += dsz ) { - dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz + 1)) : SZ - sz; + dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz; Mat aslice = arr[k].colRange(sz, sz + dsz); tested_rng.fill(aslice, dist_type, A, B); + printf("%d - %d\n", sz, sz + dsz); } } diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index f65f503529..7cc95ca0c4 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -644,6 +644,24 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String()); + /** @brief Reads a network model stored in Darknet model files. + * @param bufferCfg A buffer contains a content of .cfg file with text description of the network architecture. + * @param bufferModel A buffer contains a content of .weights file with learned network. + * @returns Net object. + */ + CV_EXPORTS_W Net readNetFromDarknet(const std::vector& bufferCfg, + const std::vector& bufferModel = std::vector()); + + /** @brief Reads a network model stored in Darknet model files. + * @param bufferCfg A buffer contains a content of .cfg file with text description of the network architecture. + * @param lenCfg Number of bytes to read from bufferCfg + * @param bufferModel A buffer contains a content of .weights file with learned network. + * @param lenModel Number of bytes to read from bufferModel + * @returns Net object. + */ + CV_EXPORTS Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg, + const char *bufferModel = NULL, size_t lenModel = 0); + /** @brief Reads a network model stored in Caffe framework's format. * @param prototxt path to the .prototxt file with text description of the network architecture. * @param caffeModel path to the .caffemodel file with learned network. @@ -651,6 +669,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String()); + /** @brief Reads a network model stored in Caffe model in memory. + * @param bufferProto buffer containing the content of the .prototxt file + * @param bufferModel buffer containing the content of the .caffemodel file + * @returns Net object. + */ + CV_EXPORTS_W Net readNetFromCaffe(const std::vector& bufferProto, + const std::vector& bufferModel = std::vector()); + /** @brief Reads a network model stored in Caffe model in memory. * @details This is an overloaded member function, provided for convenience. * It differs from the above function only in what argument(s) it accepts. @@ -672,6 +698,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String()); + /** @brief Reads a network model stored in TensorFlow framework's format. + * @param bufferModel buffer containing the content of the pb file + * @param bufferConfig buffer containing the content of the pbtxt file + * @returns Net object. 
+ */ + CV_EXPORTS_W Net readNetFromTensorflow(const std::vector& bufferModel, + const std::vector& bufferConfig = std::vector()); + /** @brief Reads a network model stored in TensorFlow framework's format. * @details This is an overloaded member function, provided for convenience. * It differs from the above function only in what argument(s) it accepts. @@ -735,6 +769,18 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = ""); + /** + * @brief Read deep learning network represented in one of the supported formats. + * @details This is an overloaded member function, provided for convenience. + * It differs from the above function only in what argument(s) it accepts. + * @param[in] framework Name of origin framework. + * @param[in] bufferModel A buffer with a content of binary file with weights + * @param[in] bufferConfig A buffer with a content of text file contains network configuration. + * @returns Net object. + */ + CV_EXPORTS_W Net readNet(const String& framework, const std::vector& bufferModel, + const std::vector& bufferConfig = std::vector()); + /** @brief Loads blob which was serialized as torch.Tensor object of Torch7 framework. * @warning This function has the same limitations as readNetFromTorch(). */ diff --git a/modules/dnn/misc/java/test/DnnTensorFlowTest.java b/modules/dnn/misc/java/test/DnnTensorFlowTest.java index 5dd423649e..4e96c73e28 100644 --- a/modules/dnn/misc/java/test/DnnTensorFlowTest.java +++ b/modules/dnn/misc/java/test/DnnTensorFlowTest.java @@ -1,10 +1,14 @@ package org.opencv.test.dnn; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.opencv.core.Core; import org.opencv.core.Mat; +import org.opencv.core.MatOfFloat; +import org.opencv.core.MatOfByte; import org.opencv.core.Scalar; import org.opencv.core.Size; import org.opencv.dnn.DictValue; @@ -26,6 +30,15 @@ public class DnnTensorFlowTest extends OpenCVTestCase { Net net; + private static void normAssert(Mat ref, Mat test) { + final double l1 = 1e-5; + final double lInf = 1e-4; + double normL1 = Core.norm(ref, test, Core.NORM_L1) / ref.total(); + double normLInf = Core.norm(ref, test, Core.NORM_INF) / ref.total(); + assertTrue(normL1 < l1); + assertTrue(normLInf < lInf); + } + @Override protected void setUp() throws Exception { super.setUp(); @@ -46,7 +59,7 @@ public class DnnTensorFlowTest extends OpenCVTestCase { File testDataPath = new File(envTestDataPath); - File f = new File(testDataPath, "dnn/space_shuttle.jpg"); + File f = new File(testDataPath, "dnn/grace_hopper_227.png"); sourceImageFile = f.toString(); if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile); @@ -77,31 +90,55 @@ public class DnnTensorFlowTest extends OpenCVTestCase { } - public void testTestNetForward() { - Mat rawImage = Imgcodecs.imread(sourceImageFile); - - assertNotNull("Loading image from file failed!", rawImage); + public void checkInceptionNet(Net net) + { + Mat image = Imgcodecs.imread(sourceImageFile); + assertNotNull("Loading image from file failed!", image); - Mat image = new Mat(); - Imgproc.resize(rawImage, image, new Size(224,224)); - - Mat inputBlob = Dnn.blobFromImage(image); + Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true); assertNotNull("Converting image to blob failed!", inputBlob); - Mat inputBlobP = new Mat(); - Core.subtract(inputBlob, new Scalar(117.0), 
inputBlobP); - - net.setInput(inputBlobP, "input" ); - - Mat result = net.forward(); + net.setInput(inputBlob, "input"); + Mat result = new Mat(); + try { + net.setPreferableBackend(Dnn.DNN_BACKEND_OPENCV); + result = net.forward("softmax2"); + } + catch (Exception e) { + fail("DNN forward failed: " + e.getMessage()); + } assertNotNull("Net returned no result!", result); - Core.MinMaxLocResult minmax = Core.minMaxLoc(result.reshape(1, 1)); + result = result.reshape(1, 1); + Core.MinMaxLocResult minmax = Core.minMaxLoc(result); + assertEquals("Wrong prediction", (int)minmax.maxLoc.x, 866); + + Mat top5RefScores = new MatOfFloat(new float[] { + 0.63032645f, 0.2561979f, 0.032181446f, 0.015721032f, 0.014785315f + }).reshape(1, 1); - assertTrue("No image recognized!", minmax.maxVal > 0.9); + Core.sort(result, result, Core.SORT_DESCENDING); + normAssert(result.colRange(0, 5), top5RefScores); + } + public void testTestNetForward() { + checkInceptionNet(net); } + public void testReadFromBuffer() { + File modelFile = new File(modelFileName); + byte[] modelBuffer = new byte[ (int)modelFile.length() ]; + + try { + FileInputStream fis = new FileInputStream(modelFile); + fis.read(modelBuffer); + fis.close(); + } catch (IOException e) { + fail("Failed to read a model: " + e.getMessage()); + } + net = Dnn.readNetFromTensorflow(new MatOfByte(modelBuffer)); + checkInceptionNet(net); + } } diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp index 37db7f039a..59f47eef1a 100644 --- a/modules/dnn/src/caffe/caffe_importer.cpp +++ b/modules/dnn/src/caffe/caffe_importer.cpp @@ -453,6 +453,15 @@ Net readNetFromCaffe(const char *bufferProto, size_t lenProto, return net; } +Net readNetFromCaffe(const std::vector& bufferProto, const std::vector& bufferModel) +{ + const char* bufferProtoPtr = reinterpret_cast(&bufferProto[0]); + const char* bufferModelPtr = bufferModel.empty() ? 
NULL : + reinterpret_cast(&bufferModel[0]); + return readNetFromCaffe(bufferProtoPtr, bufferProto.size(), + bufferModelPtr, bufferModel.size()); +} + #endif //HAVE_PROTOBUF CV__DNN_EXPERIMENTAL_NS_END diff --git a/modules/dnn/src/darknet/darknet_importer.cpp b/modules/dnn/src/darknet/darknet_importer.cpp index 8bd64d099c..282b37277c 100644 --- a/modules/dnn/src/darknet/darknet_importer.cpp +++ b/modules/dnn/src/darknet/darknet_importer.cpp @@ -44,6 +44,7 @@ #include "../precomp.hpp" #include +#include #include #include #include @@ -66,14 +67,19 @@ public: DarknetImporter() {} - DarknetImporter(const char *cfgFile, const char *darknetModel) + DarknetImporter(std::istream &cfgStream, std::istream &darknetModelStream) { CV_TRACE_FUNCTION(); - ReadNetParamsFromCfgFileOrDie(cfgFile, &net); + ReadNetParamsFromCfgStreamOrDie(cfgStream, &net); + ReadNetParamsFromBinaryStreamOrDie(darknetModelStream, &net); + } - if (darknetModel && darknetModel[0]) - ReadNetParamsFromBinaryFileOrDie(darknetModel, &net); + DarknetImporter(std::istream &cfgStream) + { + CV_TRACE_FUNCTION(); + + ReadNetParamsFromCfgStreamOrDie(cfgStream, &net); } struct BlobNote @@ -175,15 +181,75 @@ public: } }; +static Net readNetFromDarknet(std::istream &cfgFile, std::istream &darknetModel) +{ + Net net; + DarknetImporter darknetImporter(cfgFile, darknetModel); + darknetImporter.populateNet(net); + return net; } -Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= String()*/) +static Net readNetFromDarknet(std::istream &cfgFile) { - DarknetImporter darknetImporter(cfgFile.c_str(), darknetModel.c_str()); Net net; + DarknetImporter darknetImporter(cfgFile); darknetImporter.populateNet(net); return net; } +} + +Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= String()*/) +{ + std::ifstream cfgStream(cfgFile.c_str()); + if (!cfgStream.is_open()) + { + CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(cfgFile)); + } + if (darknetModel != String()) + { + std::ifstream darknetModelStream(darknetModel.c_str(), std::ios::binary); + if (!darknetModelStream.is_open()) + { + CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(darknetModel)); + } + return readNetFromDarknet(cfgStream, darknetModelStream); + } + else + return readNetFromDarknet(cfgStream); +} + +struct BufferStream : public std::streambuf +{ + BufferStream(const char* s, std::size_t n) + { + char* ptr = const_cast(s); + setg(ptr, ptr, ptr + n); + } +}; + +Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg, const char *bufferModel, size_t lenModel) +{ + BufferStream cfgBufferStream(bufferCfg, lenCfg); + std::istream cfgStream(&cfgBufferStream); + if (lenModel) + { + BufferStream weightsBufferStream(bufferModel, lenModel); + std::istream weightsStream(&weightsBufferStream); + return readNetFromDarknet(cfgStream, weightsStream); + } + else + return readNetFromDarknet(cfgStream); +} + +Net readNetFromDarknet(const std::vector& bufferCfg, const std::vector& bufferModel) +{ + const char* bufferCfgPtr = reinterpret_cast(&bufferCfg[0]); + const char* bufferModelPtr = bufferModel.empty() ? 
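The BufferStream helper above is what lets the Darknet cfg/weights parsers consume either files or in-memory buffers through a plain std::istream; a self-contained sketch of the same pattern (names are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <istream>
    #include <streambuf>
    #include <string>

    // Read-only streambuf over caller-owned memory: setg() points the get area at
    // the existing buffer, so nothing is copied and the buffer must outlive the stream.
    struct MemoryStreamBuf : public std::streambuf
    {
        MemoryStreamBuf(const char* s, std::size_t n)
        {
            char* p = const_cast<char*>(s);
            setg(p, p, p + n);
        }
    };

    int main()
    {
        std::string cfg = "[net]\nwidth=416\nheight=416\nchannels=3\n";
        MemoryStreamBuf buf(cfg.data(), cfg.size());
        std::istream is(&buf);
        for (std::string line; std::getline(is, line); )
            std::cout << line << "\n";
        return 0;
    }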
NULL : + reinterpret_cast(&bufferModel[0]); + return readNetFromDarknet(bufferCfgPtr, bufferCfg.size(), + bufferModelPtr, bufferModel.size()); +} + CV__DNN_EXPERIMENTAL_NS_END }} // namespace diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index 03805dd364..815b84f651 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -476,68 +476,61 @@ namespace cv { return dst; } - bool ReadDarknetFromCfgFile(const char *cfgFile, NetParameter *net) + bool ReadDarknetFromCfgStream(std::istream &ifile, NetParameter *net) { - std::ifstream ifile; - ifile.open(cfgFile); - if (ifile.is_open()) - { - bool read_net = false; - int layers_counter = -1; - for (std::string line; std::getline(ifile, line);) { - line = escapeString(line); - if (line.empty()) continue; - switch (line[0]) { - case '\0': break; - case '#': break; - case ';': break; - case '[': - if (line == "[net]") { - read_net = true; - } - else { - // read section - read_net = false; - ++layers_counter; - const size_t layer_type_size = line.find("]") - 1; - CV_Assert(layer_type_size < line.size()); - std::string layer_type = line.substr(1, layer_type_size); - net->layers_cfg[layers_counter]["type"] = layer_type; - } - break; - default: - // read entry - const size_t separator_index = line.find('='); - CV_Assert(separator_index < line.size()); - if (separator_index != std::string::npos) { - std::string name = line.substr(0, separator_index); - std::string value = line.substr(separator_index + 1, line.size() - (separator_index + 1)); - name = escapeString(name); - value = escapeString(value); - if (name.empty() || value.empty()) continue; - if (read_net) - net->net_cfg[name] = value; - else - net->layers_cfg[layers_counter][name] = value; - } + bool read_net = false; + int layers_counter = -1; + for (std::string line; std::getline(ifile, line);) { + line = escapeString(line); + if (line.empty()) continue; + switch (line[0]) { + case '\0': break; + case '#': break; + case ';': break; + case '[': + if (line == "[net]") { + read_net = true; + } + else { + // read section + read_net = false; + ++layers_counter; + const size_t layer_type_size = line.find("]") - 1; + CV_Assert(layer_type_size < line.size()); + std::string layer_type = line.substr(1, layer_type_size); + net->layers_cfg[layers_counter]["type"] = layer_type; + } + break; + default: + // read entry + const size_t separator_index = line.find('='); + CV_Assert(separator_index < line.size()); + if (separator_index != std::string::npos) { + std::string name = line.substr(0, separator_index); + std::string value = line.substr(separator_index + 1, line.size() - (separator_index + 1)); + name = escapeString(name); + value = escapeString(value); + if (name.empty() || value.empty()) continue; + if (read_net) + net->net_cfg[name] = value; + else + net->layers_cfg[layers_counter][name] = value; } } - - std::string anchors = net->layers_cfg[net->layers_cfg.size() - 1]["anchors"]; - std::vector vec = getNumbers(anchors); - std::map &net_params = net->net_cfg; - net->width = getParam(net_params, "width", 416); - net->height = getParam(net_params, "height", 416); - net->channels = getParam(net_params, "channels", 3); - CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0); } - else - return false; + + std::string anchors = net->layers_cfg[net->layers_cfg.size() - 1]["anchors"]; + std::vector vec = getNumbers(anchors); + std::map &net_params = net->net_cfg; + net->width = getParam(net_params, "width", 416); 
+ net->height = getParam(net_params, "height", 416); + net->channels = getParam(net_params, "channels", 3); + CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0); int current_channels = net->channels; net->out_channels_vec.resize(net->layers_cfg.size()); - int layers_counter = -1; + layers_counter = -1; setLayersParams setParams(net); @@ -676,13 +669,8 @@ namespace cv { return true; } - - bool ReadDarknetFromWeightsFile(const char *darknetModel, NetParameter *net) + bool ReadDarknetFromWeightsStream(std::istream &ifile, NetParameter *net) { - std::ifstream ifile; - ifile.open(darknetModel, std::ios::binary); - CV_Assert(ifile.is_open()); - int32_t major_ver, minor_ver, revision; ifile.read(reinterpret_cast(&major_ver), sizeof(int32_t)); ifile.read(reinterpret_cast(&minor_ver), sizeof(int32_t)); @@ -778,19 +766,18 @@ namespace cv { } - void ReadNetParamsFromCfgFileOrDie(const char *cfgFile, darknet::NetParameter *net) + void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net) { - if (!darknet::ReadDarknetFromCfgFile(cfgFile, net)) { - CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(cfgFile)); + if (!darknet::ReadDarknetFromCfgStream(ifile, net)) { + CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream"); } } - void ReadNetParamsFromBinaryFileOrDie(const char *darknetModel, darknet::NetParameter *net) + void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net) { - if (!darknet::ReadDarknetFromWeightsFile(darknetModel, net)) { - CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(darknetModel)); + if (!darknet::ReadDarknetFromWeightsStream(ifile, net)) { + CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream"); } } - } } diff --git a/modules/dnn/src/darknet/darknet_io.hpp b/modules/dnn/src/darknet/darknet_io.hpp index 5859f736b6..f783ca7b49 100644 --- a/modules/dnn/src/darknet/darknet_io.hpp +++ b/modules/dnn/src/darknet/darknet_io.hpp @@ -109,10 +109,9 @@ namespace cv { }; } - // Read parameters from a file into a NetParameter message. - void ReadNetParamsFromCfgFileOrDie(const char *cfgFile, darknet::NetParameter *net); - void ReadNetParamsFromBinaryFileOrDie(const char *darknetModel, darknet::NetParameter *net); - + // Read parameters from a stream into a NetParameter message. + void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net); + void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net); } } #endif diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 011631f54e..994df854b0 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1492,7 +1492,8 @@ struct Net::Impl // TODO: OpenCL target support more fusion styles. 
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && - ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) ) + ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" && + ld.layerInstance->type != "Concat")) ) continue; Ptr& currLayer = ld.layerInstance; @@ -1701,6 +1702,31 @@ struct Net::Impl ld.outputBlobs.size() == 1 ) { Mat& output = ld.outputBlobs[0]; + UMat umat_output; + if (!ld.outputBlobsWrappers.empty() && + (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))) + { + size_t i, ninputs = ld.inputBlobsId.size(); + bool conv_layer = true; + for( i = 0; i < ninputs; i++ ) + { + LayerPin pin = ld.inputBlobsId[i]; + LayerData* inp_i_data = &layers[pin.lid]; + while(inp_i_data->skip && + inp_i_data->inputBlobsId.size() == 1 && + inp_i_data->consumers.size() == 1) + { + pin = inp_i_data->inputBlobsId[0]; + inp_i_data = &layers[pin.lid]; + } + conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution"); + } + if (!conv_layer) + continue; + std::vector umat_outputBlobs; + umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); + umat_output = umat_outputBlobs[0]; + } // TODO: in general, this optimization can always be done, but // many layers currently check that the input/output blobs are @@ -1737,6 +1763,14 @@ struct Net::Impl // Allocate new memory to prevent collisions during memory // reusing (see https://github.com/opencv/opencv/pull/10456). output = output.clone(); + if (preferableBackend == DNN_BACKEND_OPENCV && + IS_DNN_OPENCL_TARGET(preferableTarget)) + { + std::vector umats(1); + umat_output = umat_output.clone(); + umats[0] = umat_output; + OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats); + } Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() }; int ofs = 0; for( i = 0; i < ninputs; i++ ) @@ -1753,6 +1787,12 @@ struct Net::Impl CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size); Mat* oldPtr = &curr_output; curr_output = output_slice; + if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)) + { + std::vector umats(inp_i_data->outputBlobsWrappers.size()); + umats[pin.oid] = umat_output(chrange); + OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats); + } // Layers that refer old input Mat will refer to the // new data but the same Mat object. CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output); @@ -3086,6 +3126,23 @@ Net readNet(const String& _model, const String& _config, const String& _framewor model + (config.empty() ? 
"" : ", " + config)); } +Net readNet(const String& _framework, const std::vector& bufferModel, + const std::vector& bufferConfig) +{ + String framework = _framework.toLowerCase(); + if (framework == "caffe") + return readNetFromCaffe(bufferConfig, bufferModel); + else if (framework == "tensorflow") + return readNetFromTensorflow(bufferModel, bufferConfig); + else if (framework == "darknet") + return readNetFromDarknet(bufferConfig, bufferModel); + else if (framework == "torch") + CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers"); + else if (framework == "dldt") + CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers"); + CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework); +} + Net readNetFromModelOptimizer(const String &xml, const String &bin) { return Net::readFromModelOptimizer(xml, bin); diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index c6e011b5dd..f4d4d2b822 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -295,7 +295,9 @@ public: for (int i = 0; i < num; i++) confPreds.push_back(Mat(2, shape, CV_32F)); - UMat umat = inp1.reshape(1, num * numPredsPerClass); + shape[0] = num * numPredsPerClass; + shape[1] = inp1.total() / shape[0]; + UMat umat = inp1.reshape(1, 2, &shape[0]); for (int i = 0; i < num; ++i) { Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() }; @@ -342,7 +344,7 @@ public: // Decode all loc predictions to bboxes bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors, _shareLocation, _numLocClasses, _backgroundLabelId, - _codeType, _varianceEncodedInTarget, false, + _codeType, _varianceEncodedInTarget, _clip, allDecodedBBoxes); if (!ret) return false; diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 0756f38f80..1e41585672 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -369,15 +369,11 @@ public: // clip the prior's coordinate such that it is within [0, 1] if (_clip) { - Mat mat = outputs[0].getMat(ACCESS_READ); - int aspect_count = (_maxSize > 0) ? 1 : 0; - int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size()); - float* outputPtr = mat.ptr() + offset; - int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4; - for (size_t d = 0; d < _outChannelSize; ++d) - { - outputPtr[d] = std::min(std::max(outputPtr[d], 0.), 1.); - } + ocl::Kernel kernel("clip", ocl::dnn::prior_box_oclsrc, opts); + size_t nthreads = _layerHeight * _layerWidth * _numPriors * 4; + if (!kernel.args((int)nthreads, ocl::KernelArg::PtrReadWrite(outputs[0])) + .run(1, &nthreads, NULL, false)) + return false; } // set the variance. 
diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 4bf7b506a5..b26206694d 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -14,7 +14,7 @@ namespace cv { namespace dnn { class ResizeLayerImpl : public ResizeLayer { public: - ResizeLayerImpl(const LayerParams& params) + ResizeLayerImpl(const LayerParams& params) : scaleWidth(0), scaleHeight(0) { setParamsFrom(params); outWidth = params.get("width", 0); diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp index 6c69d773a4..19c6cfc88e 100644 --- a/modules/dnn/src/layers/shuffle_channel_layer.cpp +++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp @@ -14,6 +14,7 @@ public: ShuffleChannelLayerImpl(const LayerParams& params) { group = params.get("group", 1); + setParamsFrom(params); } bool getMemoryShapes(const std::vector &inputs, diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 135874812b..5f50289847 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -110,27 +110,26 @@ public: outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); + UMat& src = inputs[0]; + UMat& dstMat = outputs[0]; + int axis = clamp(axisRaw, src.dims); + if (softmaxOp.empty()) { OCL4DNNSoftmaxConfig config; - config.in_shape = shape(inputs[0]); - config.axis = axisRaw; - config.channels = inputs[0].size[axisRaw]; + config.axis = axis; + config.channels = inputs[0].size[axis]; config.logsoftmax = logSoftMax; config.use_half = use_half; softmaxOp = Ptr >(new OCL4DNNSoftmax(config)); } - UMat& src = inputs[0]; - UMat& dstMat = outputs[0]; - if (softmaxOp->Forward(src, dstMat)) return true; UMat& bufMat = internals[0]; - int axis = clamp(axisRaw, src.dims); MatShape s = shape(src); size_t outerSize = total(s, 0, axis); size_t channels = src.size[axis]; diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp index 8329c148b0..ea5c2c8938 100644 --- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -612,7 +612,7 @@ bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, ret = k.run(1, globalsize, localsize, false); } - if ((row_size % 4) != 0 && ret) + if (row_size < 4 || ((row_size % 4) != 0 && ret)) { String kname = format("matvec_mul1_%s", use_half ? "half" : "float"); ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 034f8d3e7d..c889c7d85c 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -821,7 +821,7 @@ void OCL4DNNConvSpatial::CreateSubBuffer(const UMat& buffer, UMat& sub_bu cl_int err; size_t element_size = (use_half_) ? sizeof(short) : sizeof(float); - region.origin = offset * element_size; + region.origin = offset * element_size + buffer.offset; region.size = size * element_size; sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, @@ -853,6 +853,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; int32_t bias_offset; + int32_t element_size = use_half_ ? 
sizeof(short) : sizeof(float); if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (!swizzleWeight(weight, config->workItem_output[2], false)) @@ -931,10 +932,12 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); } else { kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); } kernel.set(argIdx++, (uint16_t)width_); @@ -1024,10 +1027,12 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); } else { kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); } kernel.set(argIdx++, (uint16_t)width_); @@ -1079,6 +1084,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)output_w_); @@ -1126,6 +1132,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (void *)NULL); kernel.set(argIdx++, bias_offset); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); kernel.set(argIdx++, output_image_offset); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); @@ -1230,20 +1237,22 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, tuned_ = saved_tuned; UMat new_top, new_verify_top; - float *data, *verify_data; + Mat mat_top, mat_verify_top; if (use_half_) { convertFp16(top, new_top); convertFp16(verifyTop, new_verify_top); - data = (float *)new_top.getMat(ACCESS_READ).ptr(); - verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr(); + mat_top = new_top.getMat(ACCESS_READ); + mat_verify_top = new_verify_top.getMat(ACCESS_READ); } else { - data = (float *)top.getMat(ACCESS_READ).ptr(); - verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr(); + mat_top = top.getMat(ACCESS_READ); + mat_verify_top = verifyTop.getMat(ACCESS_READ); } + const float* data = mat_top.ptr(); + const float* verify_data = mat_verify_top.ptr(); for (int32_t n = 0; n < num_; ++n) { for (int32_t g = 0; g < group_; ++g) { diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 2cc161d3ff..adeb38574e 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -136,7 +136,8 @@ __kernel void ConvolveBasic( int kernel_offset, __global Dtype* bias, const int bias_offset, - __global Dtype* convolved_image, + __global Dtype* convolved_image_base, + const int convolved_image_base_offset, const int convolved_image_offset, const ushort input_width, const ushort input_height, @@ -146,6 +147,7 @@ __kernel void ConvolveBasic( const ushort pad_h ) { + __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset; const int outputX = get_global_id(0); const int outputY = get_global_id(1); const int kernelNum = get_global_id(2) * ZPAR; @@ -220,12 +222,14 @@ convolve_simd( __global Dtype* inputs, __global Dtype* weights, BIAS_KERNEL_ARG - 
__global Dtype* outputs, + __global Dtype* outputs_base, + const int outputs_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { + __global Dtype* outputs = outputs_base + outputs_offset; unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth @@ -395,7 +399,8 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. const __global Dtype *src0, \ const __global Dtype *src1, \ BIAS_KERNEL_ARG \ - __global Dtype *dst, \ + __global Dtype *dst_base, \ + const int dst_offset, \ const ushort input_width, \ const ushort input_height, \ const ushort output_width, \ @@ -425,6 +430,7 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -813,6 +819,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1374,6 +1381,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1559,6 +1567,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1770,12 +1779,13 @@ __kernel void DWCONV( __global Dtype* image_data, __global Dtype* kernel_data, BIAS_KERNEL_ARG - __global Dtype* convolved_image, + __global Dtype* convolved_image_base, + const int convolved_image_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { - + __global Dtype* convolved_image = convolved_image_base + convolved_image_offset; const int outputX = get_global_id(0); const int outputY = get_global_id(1); const int outputZ = get_global_id(2); diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl index 6ffbf8df29..d898a13ffd 100644 --- a/modules/dnn/src/opencl/prior_box.cl +++ b/modules/dnn/src/opencl/prior_box.cl @@ -107,3 +107,13 @@ __kernel void set_variance(const int nthreads, vstore4(var_vec, 0, dst + offset + index * 4); } } + +__kernel void clip(const int nthreads, + __global Dtype* dst) +{ + for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) + { + Dtype4 vec = vload4(index, dst); + vstore4(clamp(vec, 0, 1), index, dst); + } +} diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 7d7d300386..89732b45ad 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1856,5 
+1856,14 @@ Net readNetFromTensorflow(const char* bufferModel, size_t lenModel, return net; } +Net readNetFromTensorflow(const std::vector& bufferModel, const std::vector& bufferConfig) +{ + const char* bufferModelPtr = reinterpret_cast(&bufferModel[0]); + const char* bufferConfigPtr = bufferConfig.empty() ? NULL : + reinterpret_cast(&bufferConfig[0]); + return readNetFromTensorflow(bufferModelPtr, bufferModel.size(), + bufferConfigPtr, bufferConfig.size()); +} + CV__DNN_EXPERIMENTAL_NS_END }} // namespace diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 682213b791..077498d92e 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -65,6 +65,34 @@ TEST(Test_Darknet, read_yolo_voc) ASSERT_FALSE(net.empty()); } +TEST(Test_Darknet, read_yolo_voc_stream) +{ + Mat ref; + Mat sample = imread(_tf("dog416.png")); + Mat inp = blobFromImage(sample, 1.0/255, Size(416, 416), Scalar(), true, false); + const std::string cfgFile = findDataFile("dnn/yolo-voc.cfg", false); + const std::string weightsFile = findDataFile("dnn/yolo-voc.weights", false); + // Import by paths. + { + Net net = readNetFromDarknet(cfgFile, weightsFile); + net.setInput(inp); + net.setPreferableBackend(DNN_BACKEND_OPENCV); + ref = net.forward(); + } + // Import from bytes array. + { + std::string cfg, weights; + readFileInMemory(cfgFile, cfg); + readFileInMemory(weightsFile, weights); + + Net net = readNetFromDarknet(&cfg[0], cfg.size(), &weights[0], weights.size()); + net.setInput(inp); + net.setPreferableBackend(DNN_BACKEND_OPENCV); + Mat out = net.forward(); + normAssert(ref, out); + } +} + class Test_Darknet_layers : public DNNTestLayer { public: diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index c61f7e378a..563ae993b6 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -104,8 +104,14 @@ TEST_P(Convolution, Accuracy) int backendId = get<0>(get<7>(GetParam())); int targetId = get<1>(get<7>(GetParam())); - if ((backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD) || - (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16)) + if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD) + throw SkipTestException(""); + + // TODO: unstable test cases + if (backendId == DNN_BACKEND_OPENCV && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) && + inChannels == 6 && outChannels == 9 && group == 1 && inSize == Size(5, 6) && + kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1) && dilation == Size(1, 1) && + hasBias) throw SkipTestException(""); int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width}; @@ -353,8 +359,7 @@ TEST_P(FullyConnected, Accuracy) bool hasBias = get<3>(GetParam()); int backendId = get<0>(get<4>(GetParam())); int targetId = get<1>(get<4>(GetParam())); - if (backendId == DNN_BACKEND_INFERENCE_ENGINE || - (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16)) + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F); @@ -692,10 +697,6 @@ TEST_P(Eltwise, Accuracy) int backendId = get<0>(get<4>(GetParam())); int targetId = get<1>(get<4>(GetParam())); - if (backendId == DNN_BACKEND_OPENCV && - (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)) - throw SkipTestException(""); - 
     Net net;
     std::vector<int> convLayerIds(numConv);
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index bd567ce72e..ca6645057b 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -763,8 +763,7 @@ TEST_P(Test_Caffe_layers, Average_pooling_kernel_area)
 // Test PriorBoxLayer in case of no aspect ratios (just squared proposals).
 TEST_P(Test_Caffe_layers, PriorBox_squares)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
-        (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)))
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE)
         throw SkipTestException("");
     LayerParams lp;
     lp.name = "testPriorBox";
@@ -791,7 +790,8 @@ TEST_P(Test_Caffe_layers, PriorBox_squares)
                                     0.25, 0.0, 1.0, 1.0,
                                     0.1f, 0.1f, 0.2f, 0.2f,
                                     0.1f, 0.1f, 0.2f, 0.2f);
-    normAssert(out.reshape(1, 4), ref);
+    double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-5 : 1e-5;
+    normAssert(out.reshape(1, 4), ref, "", l1);
 }
 
 typedef TestWithParam<tuple<int, int> > Layer_Test_DWconv_Prelu;
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 66b9d4f642..6ab0e41e18 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -243,10 +243,15 @@ TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
     runTensorFlowNet("l2_normalize_3d");
 }
 
-typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_nets;
+class Test_TensorFlow_nets : public DNNTestLayer {};
 
 TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 {
+    checkBackend();
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+
     std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
     std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
     std::string imgPath = findDataFile("dnn/street.png", false);
@@ -260,29 +265,30 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
     outNames[1] = "concat_1";
     outNames[2] = "detection_out";
 
-    std::vector<Mat> target(outNames.size());
+    std::vector<Mat> refs(outNames.size());
     for (int i = 0; i < outNames.size(); ++i)
     {
         std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco."
+ outNames[i] + ".npy", false); - target[i] = blobFromNPY(path); + refs[i] = blobFromNPY(path); } Net net = readNetFromTensorflow(netPath, netConfig); - net.setPreferableBackend(DNN_BACKEND_OPENCV); - net.setPreferableTarget(GetParam()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); net.setInput(inp); std::vector output; net.forward(output, outNames); - normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4); - normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4); - normAssertDetections(target[2], output[2], "", 0.2); + normAssert(refs[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4); + normAssert(refs[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4); + normAssertDetections(refs[2], output[2], "", 0.2); } TEST_P(Test_TensorFlow_nets, Inception_v2_SSD) { + checkBackend(); std::string proto = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", false); std::string model = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false); @@ -290,8 +296,8 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD) Mat img = imread(findDataFile("dnn/street.png", false)); Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false); - net.setPreferableBackend(DNN_BACKEND_OPENCV); - net.setPreferableTarget(GetParam()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); net.setInput(blob); // Output has shape 1x1xNx7 where N - number of detections. @@ -302,16 +308,24 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD) 0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015, 0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527, 0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384); - normAssertDetections(ref, out, "", 0.5); + double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1; + double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.025 : default_lInf; + normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff); } TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN) { + checkBackend(); + if (backend == DNN_BACKEND_INFERENCE_ENGINE || + (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) + throw SkipTestException(""); + std::string proto = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", false); std::string model = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false); Net net = readNetFromTensorflow(model, proto); - net.setPreferableBackend(DNN_BACKEND_OPENCV); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); Mat img = imread(findDataFile("dnn/dog416.png", false)); Mat blob = blobFromImage(img, 1.0f / 127.5, Size(800, 600), Scalar(127.5, 127.5, 127.5), true, false); @@ -324,6 +338,11 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN) TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8) { + checkBackend(); + if (backend == DNN_BACKEND_INFERENCE_ENGINE && + (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)) + throw SkipTestException(""); + std::string proto = findDataFile("dnn/opencv_face_detector.pbtxt", false); std::string model = findDataFile("dnn/opencv_face_detector_uint8.pb", false); @@ -331,9 +350,8 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8) Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false)); Mat blob = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false); - net.setPreferableBackend(DNN_BACKEND_OPENCV); - net.setPreferableTarget(GetParam()); - + net.setPreferableBackend(backend); + net.setPreferableTarget(target); net.setInput(blob); // Output has shape 1x1xNx7 where N - number of detections. // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom] @@ -346,7 +364,9 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8) 0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477, 0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494, 0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801); - normAssertDetections(ref, out, "", 0.9, 3.4e-3, 1e-2); + double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 4e-3 : 3.4e-3; + double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.017 : 1e-2; + normAssertDetections(ref, out, "", 0.9, scoreDiff, iouDiff); } // inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png') @@ -360,6 +380,10 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8) // np.save('east_text_detection.geometry.npy', geometry) TEST_P(Test_TensorFlow_nets, EAST_text_detection) { + checkBackend(); + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + throw SkipTestException(""); + std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false); std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false); std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false); @@ -367,7 +391,8 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection) Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false)); - net.setPreferableTarget(GetParam()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); Mat img = imread(imgPath); Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false); @@ -386,7 +411,7 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection) normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 1e-4, 3e-3); } -INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets()); +INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, dnnBackendsAndTargets()); TEST_P(Test_TensorFlow_layers, fp16_weights) { diff --git a/modules/features2d/src/bagofwords.cpp b/modules/features2d/src/bagofwords.cpp index 31fb19d837..65eef9a0ef 100644 --- a/modules/features2d/src/bagofwords.cpp +++ b/modules/features2d/src/bagofwords.cpp @@ -177,6 +177,7 @@ void BOWImgDescriptorExtractor::compute( InputArray keypointDescriptors, OutputA CV_INSTRUMENT_REGION() CV_Assert( !vocabulary.empty() ); + CV_Assert(!keypointDescriptors.empty()); int clusterCount = descriptorSize(); // = vocabulary.rows diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp index ffcc431b58..b8a2e95977 100644 --- a/modules/features2d/src/blobdetector.cpp +++ b/modules/features2d/src/blobdetector.cpp @@ -264,6 +264,8 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag convexHull(Mat(contours[contourIdx]), hull); double area = contourArea(Mat(contours[contourIdx])); double hullArea = contourArea(Mat(hull)); + if (fabs(hullArea) < DBL_EPSILON) + continue; double ratio = area / hullArea; if (ratio < params.minConvexity || ratio >= params.maxConvexity) continue; @@ -309,6 +311,7 @@ void SimpleBlobDetectorImpl::detect(InputArray image, std::vector& CV_INSTRUMENT_REGION() keypoints.clear(); + CV_Assert(params.minRepeatability != 0); Mat grayscaleImage; if (image.channels() == 3 || image.channels() == 4) cvtColor(image, grayscaleImage, COLOR_BGR2GRAY); diff --git a/modules/features2d/src/brisk.cpp b/modules/features2d/src/brisk.cpp index bacea2b5a4..5e233d0d8f 100644 --- a/modules/features2d/src/brisk.cpp +++ b/modules/features2d/src/brisk.cpp @@ -506,6 +506,7 @@ BRISK_Impl::smoothedIntensity(const cv::Mat& image, const cv::Mat& integral, con // scaling: const int scaling = (int)(4194304.0 / area); const int scaling2 = int(float(scaling) * area / 1024.0); + CV_Assert(scaling2 != 0); // the integral image is larger: const int integralcols = imagecols + 1; @@ -2238,6 +2239,7 @@ BriskLayer::value(const cv::Mat& mat, float xf, float yf, float scale_in) const // scaling: const int scaling = (int)(4194304.0f / area); const int scaling2 = (int)(float(scaling) * area / 1024.0f); + 
CV_Assert(scaling2 != 0); // calculate borders const float x_1 = xf - sigma_half; diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp index 728a7bba70..b51e9ae75c 100644 --- a/modules/imgproc/perf/perf_warp.cpp +++ b/modules/imgproc/perf/perf_warp.cpp @@ -271,7 +271,7 @@ void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode ) } } -PERF_TEST(Transform, getPerspectiveTransform) +PERF_TEST(Transform, getPerspectiveTransform_1000) { unsigned int size = 8; Mat source(1, size/2, CV_32FC2); @@ -280,12 +280,14 @@ PERF_TEST(Transform, getPerspectiveTransform) declare.in(source, destination, WARMUP_RNG); - TEST_CYCLE() + PERF_SAMPLE_BEGIN() + for (int i = 0; i < 1000; i++) { transformCoefficient = getPerspectiveTransform(source, destination); } + PERF_SAMPLE_END() - SANITY_CHECK(transformCoefficient, 1e-5); + SANITY_CHECK_NOTHING(); } } // namespace diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 74f5e6201a..10f7744c1f 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -209,7 +209,14 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage, CV_Error( CV_StsBadSize, "" ); CvContourScanner scanner = (CvContourScanner)cvAlloc( sizeof( *scanner )); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif memset( scanner, 0, sizeof(*scanner) ); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif scanner->storage1 = scanner->storage2 = storage; scanner->img0 = (schar *) img; diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index c601ceaf33..999ce959c9 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -546,10 +546,10 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, size_t lmsz = dev.localMemSize(); size_t src_step = _src.step(), src_offset = _src.offset(); const size_t tileSizeYmax = wgs / tileSizeX; + CV_Assert(src_step != 0 && esz != 0); // workaround for NVIDIA: 3 channel vector type takes 4*elem_size in local memory int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn; - if (((src_offset % src_step) % esz == 0) && ( (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 99fd6a9dcd..98eeee7f9f 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -2563,6 +2563,11 @@ static const int CodeDeltas[8][2] = #define CV_ADJUST_EDGE_COUNT( count, seq ) \ ((count) -= ((count) == (seq)->total && !CV_IS_SEQ_CLOSED(seq))) +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + CV_IMPL void cvDrawContours( void* _img, CvSeq* contour, CvScalar _externalColor, CvScalar _holeColor, @@ -2894,4 +2899,8 @@ cvGetTextSize( const char *text, const CvFont *_font, CvSize *_size, int *_base_ *_size = size; } +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop // "-Wclass-memaccess" +#endif + /* End of file. 
*/ diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 23c560736a..a0866156a1 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -4284,10 +4284,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || - borderType == BORDER_REFLECT || borderType == BORDER_WRAP || - borderType == BORDER_REFLECT_101)) + if (esz == 0 + || (src_offset % src_step) % esz != 0 + || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) + || !(borderType == BORDER_CONSTANT + || borderType == BORDER_REPLICATE + || borderType == BORDER_REFLECT + || borderType == BORDER_WRAP + || borderType == BORDER_REFLECT_101)) return false; size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp index 0df59d3a8a..953b4bba9e 100644 --- a/modules/imgproc/src/floodfill.cpp +++ b/modules/imgproc/src/floodfill.cpp @@ -642,8 +642,15 @@ cvFloodFill( CvArr* arr, CvPoint seed_point, CvScalar newVal, CvScalar lo_diff, CvScalar up_diff, CvConnectedComp* comp, int flags, CvArr* maskarr ) { +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif if( comp ) memset( comp, 0, sizeof(*comp) ); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif cv::Mat img = cv::cvarrToMat(arr), mask = cv::cvarrToMat(maskarr); int area = cv::floodFill(img, mask, seed_point, newVal, diff --git a/modules/imgproc/src/grabcut.cpp b/modules/imgproc/src/grabcut.cpp index 6edce405cb..ff3c601548 100644 --- a/modules/imgproc/src/grabcut.cpp +++ b/modules/imgproc/src/grabcut.cpp @@ -174,6 +174,7 @@ void GMM::addSample( int ci, const Vec3d color ) void GMM::endLearning() { + CV_Assert(totalSampleCount > 0); const double variance = 0.01; for( int ci = 0; ci < componentsCount; ci++ ) { diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 9e069bc84d..ad090fd247 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -50,6 +50,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" +#include #include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" #include "opencv2/core/softfloat.hpp" @@ -3061,7 +3062,9 @@ cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] ) b[i+4] = dst[i].y; } - solve( A, B, X, DECOMP_SVD ); + static int param_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD = + (int)utils::getConfigurationParameterSizeT("OPENCV_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD", (size_t)DECOMP_LU); + solve(A, B, X, param_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD); M.ptr()[8] = 1.; return M; @@ -3283,6 +3286,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize, if (!(flags & CV_WARP_INVERSE_MAP)) { + CV_Assert(!dsize.empty()); double Kangle = CV_2PI / dsize.height; int phi, rho; @@ -3329,6 +3333,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize, Mat src = _dst.getMat(); Size ssize = _dst.size(); ssize.height -= 2 * ANGLE_BORDER; + CV_Assert(!ssize.empty()); const double Kangle = CV_2PI / ssize.height; 
double Kmag; if (semiLog) diff --git a/modules/imgproc/src/linefit.cpp b/modules/imgproc/src/linefit.cpp index 246d693586..103fa55950 100644 --- a/modules/imgproc/src/linefit.cpp +++ b/modules/imgproc/src/linefit.cpp @@ -47,6 +47,7 @@ static const double eps = 1e-6; static void fitLine2D_wods( const Point2f* points, int count, float *weights, float *line ) { + CV_Assert(count > 0); double x = 0, y = 0, x2 = 0, y2 = 0, xy = 0, w = 0; double dx2, dy2, dxy; int i; @@ -98,6 +99,7 @@ static void fitLine2D_wods( const Point2f* points, int count, float *weights, fl static void fitLine3D_wods( const Point3f * points, int count, float *weights, float *line ) { + CV_Assert(count > 0); int i; float w0 = 0; float x0 = 0, y0 = 0, z0 = 0; diff --git a/modules/imgproc/src/lsd.cpp b/modules/imgproc/src/lsd.cpp index d73787407e..370d76955d 100644 --- a/modules/imgproc/src/lsd.cpp +++ b/modules/imgproc/src/lsd.cpp @@ -772,6 +772,7 @@ bool LineSegmentDetectorImpl::refine(std::vector& reg, double reg_a ++n; } } + CV_Assert(n > 0); double mean_angle = sum / double(n); // 2 * standard deviation double tau = 2.0 * sqrt((s_sum - 2.0 * mean_angle * sum) / double(n) + mean_angle * mean_angle); diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 7e52f1f2fa..fc986aff0b 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -495,6 +495,13 @@ static bool ocl_moments( InputArray _src, Moments& m, bool binary) const int TILE_SIZE = 32; const int K = 10; + Size sz = _src.getSz(); + int xtiles = divUp(sz.width, TILE_SIZE); + int ytiles = divUp(sz.height, TILE_SIZE); + int ntiles = xtiles*ytiles; + if (ntiles == 0) + return false; + ocl::Kernel k = ocl::Kernel("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d%s", TILE_SIZE, @@ -504,10 +511,6 @@ static bool ocl_moments( InputArray _src, Moments& m, bool binary) return false; UMat src = _src.getUMat(); - Size sz = src.size(); - int xtiles = (sz.width + TILE_SIZE-1)/TILE_SIZE; - int ytiles = (sz.height + TILE_SIZE-1)/TILE_SIZE; - int ntiles = xtiles*ytiles; UMat umbuf(1, ntiles*K, CV_32S); size_t globalsize[] = {(size_t)xtiles, std::max((size_t)TILE_SIZE, (size_t)sz.height)}; diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2c032ada32..f4077521e1 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1709,6 +1709,7 @@ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) { + CV_Assert(n > 0); const int SMALL_GAUSSIAN_SIZE = 7; static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = { @@ -1747,6 +1748,7 @@ cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) } } + CV_DbgAssert(fabs(sum) > 0); sum = 1./sum; for( i = 0; i < n; i++ ) { @@ -5329,6 +5331,7 @@ public: wsum += w; } // overflow is not possible here => there is no need to use cv::saturate_cast + CV_DbgAssert(fabs(wsum) > 0); dptr[j] = (uchar)cvRound(sum/wsum); } } @@ -5414,6 +5417,7 @@ public: sum_b += b*w; sum_g += g*w; sum_r += r*w; wsum += w; } + CV_DbgAssert(fabs(wsum) > 0); wsum = 1.f/wsum; b0 = cvRound(sum_b*wsum); g0 = cvRound(sum_g*wsum); @@ -5673,6 +5677,7 @@ public: sum += val*w; wsum += w; } + CV_DbgAssert(fabs(wsum) > 0); dptr[j] = (float)(sum/wsum); } } @@ -5763,6 +5768,7 @@ public: sum_b += b*w; sum_g += g*w; sum_r += r*w; wsum += w; } + CV_DbgAssert(fabs(wsum) > 0); wsum = 1.f/wsum; b0 = sum_b*wsum; g0 = sum_g*wsum; diff --git a/modules/imgproc/test/test_grabcut.cpp 
b/modules/imgproc/test/test_grabcut.cpp index eae8b3e482..7bf6555a0c 100644 --- a/modules/imgproc/test/test_grabcut.cpp +++ b/modules/imgproc/test/test_grabcut.cpp @@ -89,7 +89,6 @@ void CV_GrabcutTest::run( int /* start_from */) Mat exp_bgdModel, exp_fgdModel; Mat mask; - mask = Scalar(0); Mat bgdModel, fgdModel; grabCut( img, mask, rect, bgdModel, fgdModel, 0, GC_INIT_WITH_RECT ); bgdModel.copyTo(exp_bgdModel); diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index 8a40eb4ca7..bb0716cca3 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -186,7 +186,14 @@ void CV_MomentsTest::prepare_to_validation( int /*test_case_idx*/ ) int i, y, x, cols = src.cols; double xc = 0., yc = 0.; +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif memset( &m, 0, sizeof(m)); +#if defined __GNUC__ && __GNUC__ >= 8 +#pragma GCC diagnostic pop +#endif int coi = 0; for( y = 0; y < src.rows; y++ ) diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index 5de2e34890..d3a4664d38 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -67,6 +67,10 @@ type_dict = { "double[]": { "j_type" : "double[]", "jn_type" : "double[]", "jni_type" : "jdoubleArray", "suffix" : "_3D" } } +# Defines a rule to add extra prefixes for names from specific namespaces. +# In example, cv::fisheye::stereoRectify from namespace fisheye is wrapped as fisheye_stereoRectify +namespaces_dict = {} + # { class : { func : {j_code, jn_code, cpp_code} } } ManualFuncs = {} @@ -148,6 +152,8 @@ class ConstInfo(GeneralInfo): self.cname = self.name.replace(".", "::") self.value = decl[1] self.addedManually = addedManually + if self.namespace in namespaces_dict: + self.name = '%s_%s' % (namespaces_dict[self.namespace], self.name) def __repr__(self): return Template("CONST $name=$value$manual").substitute(name=self.name, @@ -297,11 +303,13 @@ class ArgInfo(): class FuncInfo(GeneralInfo): def __init__(self, decl, namespaces=[]): # [ funcname, return_ctype, [modifiers], [args] ] GeneralInfo.__init__(self, "func", decl, namespaces) - self.cname = self.name.replace(".", "::") + self.cname = decl[0].replace(".", "::") self.jname = self.name self.isconstructor = self.name == self.classname if "[" in self.name: self.jname = "getelem" + if self.namespace in namespaces_dict: + self.jname = '%s_%s' % (namespaces_dict[self.namespace], self.jname) for m in decl[2]: if m.startswith("="): self.jname = m[1:] @@ -688,9 +696,9 @@ class JavaWrapperGenerator(object): # java part: #java doc comment - f_name = fi.name + f_name = fi.jname if fi.classname: - f_name = fi.classname + "::" + fi.name + f_name = fi.classname + "::" + fi.jname java_doc = "//javadoc: " + f_name + "(%s)" % ", ".join([a.name for a in args if a.ctype]) j_code.write(" "*4 + java_doc + "\n") @@ -897,13 +905,10 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname j_signatures.append(j_signature) # processing args with default values - if not args or not args[-1].defval: + if args and args[-1].defval: + args.pop() + else: break - while args and args[-1].defval: - # 'smart' overloads filtering - a = args.pop() - if a.name in ('mask', 'dtype', 'ddepth', 'lineType', 'borderType', 'borderMode', 'criteria'): - break @@ -1146,6 +1151,7 @@ if __name__ == "__main__": type_dict.update(gen_type_dict.get("type_dict", {})) ManualFuncs.update(gen_type_dict.get("ManualFuncs", 
{}))
        func_arg_fix.update(gen_type_dict.get("func_arg_fix", {}))
+       namespaces_dict.update(gen_type_dict.get("namespaces_dict", {}))
        if 'module_j_code' in gen_type_dict:
            module_j_code = read_contents(checkFileRemap(os.path.join(misc_location, gen_type_dict['module_j_code'])))
        if 'module_jn_code' in gen_type_dict:
diff --git a/modules/ml/src/em.cpp b/modules/ml/src/em.cpp
index 8a1020dbd9..c2dfc9c523 100644
--- a/modules/ml/src/em.cpp
+++ b/modules/ml/src/em.cpp
@@ -616,6 +616,7 @@ public:
                 expDiffSum += v; // sum_j(exp(L_ij - L_iq))
             }
 
+            CV_Assert(expDiffSum > 0);
             if(probs)
                 L.convertTo(*probs, ptype, 1./expDiffSum);
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index cc5253e57c..b7e32b92b0 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -170,6 +170,7 @@ public:
                 double val = std::abs(w->ord_responses[w->sidx[i]]);
                 max_response = std::max(max_response, val);
             }
+            CV_Assert(fabs(max_response) > 0);
         }
 
         if( rparams.calcVarImportance )
diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp
index da76a81f87..2f9dc049e1 100644
--- a/modules/ml/src/tree.cpp
+++ b/modules/ml/src/tree.cpp
@@ -630,7 +630,7 @@ void DTreesImpl::calcValue( int nidx, const vector<int>& _sidx )
                 w->cv_Tn[nidx*cv_n + j] = INT_MAX;
             }
         }
-
+        CV_Assert(fabs(sumw) > 0);
         node->node_risk = sum2 - (sum/sumw)*sum;
         node->node_risk /= sumw;
         node->value = sum/sumw;
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index f0136b0f1c..2f786ebee9 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -670,6 +670,21 @@ public:
     void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
 };
 
+class CV_EXPORTS QRCodeDetector
+{
+public:
+    QRCodeDetector();
+    ~QRCodeDetector();
+
+    void setEpsX(double epsX);
+    void setEpsY(double epsY);
+
+    bool detect(InputArray in, OutputArray points) const;
+protected:
+    struct Impl;
+    Ptr<Impl> p;
+};
+
 /** @brief Detect QR code in image and return minimum area of quadrangle that describes QR code.
     @param in  Matrix of the type CV_8UC1 containing an image where QR code are detected.
     @param points Output vector of vertices of a quadrangle of minimal area that describes QR code.
diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp
index 33110116b8..4d45d28d10 100644
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@@ -67,6 +67,11 @@
 # endif
 #endif
 
+#if defined __GNUC__ && __GNUC__ >= 8
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"
+#endif
+
 /* these settings affect the quality of detection: change with care */
 #define CV_ADJUST_FEATURES 1
 #define CV_ADJUST_WEIGHTS 0
@@ -599,7 +604,7 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
                 else
                     sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
             }
-
+            CV_Assert(area0 > 0);
             hidfeature->rect[0].weight = (float)(-sum0/area0);
         } /* l */
     } /* j */
@@ -2290,4 +2295,8 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier,
                   icvReadHaarClassifier, icvWriteHaarClassifier,
                   icvCloneHaarClassifier );
 
+#if defined __GNUC__ && __GNUC__ >= 8
+#pragma GCC diagnostic pop
+#endif
+
 /* End of file.
*/ diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index e3c8d559ae..c3a5593d35 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -15,45 +15,52 @@ namespace cv { +using std::vector; + class QRDecode { - public: - void init(Mat src, double eps_vertical_ = 0.19, double eps_horizontal_ = 0.09); +public: + void init(Mat src, double eps_vertical_ = 0.2, double eps_horizontal_ = 0.1); void binarization(); bool localization(); bool transformation(); Mat getBinBarcode() { return bin_barcode; } - Mat getLocalizationBarcode() { return local_barcode; } - Mat getTransformationBarcode() { return transform_barcode; } - std::vector getTransformationPoints() { return transformation_points; } Mat getStraightBarcode() { return straight_barcode; } - protected: - std::vector searchVerticalLines(); - std::vector separateHorizontalLines(std::vector list_lines); - std::vector pointClustering(std::vector list_lines); - void fixationPoints(std::vector &local_point, std::vector &local_len); - Point getTransformationPoint(Point left, Point center, double cos_angle_rotation, - bool right_rotate = true); - Point intersectionLines(Point a1, Point a2, Point b1, Point b2); - std::vector getQuadrilateral(std::vector angle_list); - double getQuadrilateralArea(Point a, Point b, Point c, Point d); - double getCosVectors(Point a, Point b, Point c); - - Mat barcode, bin_barcode, local_barcode, transform_barcode, straight_barcode; - std::vector localization_points, transformation_points; - std::vector localization_length; - double experimental_area; - - double eps_vertical, eps_horizontal; - std::vector result; - std::vector test_lines; - uint8_t next_pixel, future_pixel; - double length, weight; + vector getTransformationPoints() { return transformation_points; } +protected: + bool computeTransformationPoints(); + vector searchVerticalLines(); + vector separateHorizontalLines(vector list_lines); + void fixationPoints(vector &local_point); + Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2); + vector getQuadrilateral(vector angle_list); + bool testBypassRoute(vector hull, int start, int finish); + double getTriangleArea(Point2f a, Point2f b, Point2f c); + double getPolygonArea(vector points); + double getCosVectors(Point2f a, Point2f b, Point2f c); + + Mat barcode, bin_barcode, straight_barcode; + vector localization_points, transformation_points; + double eps_vertical, eps_horizontal, coeff_expansion; }; + void QRDecode::init(Mat src, double eps_vertical_, double eps_horizontal_) { - barcode = src; + double min_side = std::min(src.size().width, src.size().height); + if (min_side < 512.0) + { + coeff_expansion = 512.0 / min_side; + int width = static_cast(src.size().width * coeff_expansion); + int height = static_cast(src.size().height * coeff_expansion); + Size new_size(width, height); + resize(src, barcode, new_size); + } + else + { + coeff_expansion = 1.0; + barcode = src; + } eps_vertical = eps_vertical_; eps_horizontal = eps_horizontal_; } @@ -65,37 +72,12 @@ void QRDecode::binarization() threshold(filter_barcode, bin_barcode, 0, 255, THRESH_BINARY + THRESH_OTSU); } -bool QRDecode::localization() +vector QRDecode::searchVerticalLines() { - cvtColor(bin_barcode, local_barcode, COLOR_GRAY2RGB); - Point begin, end; - - std::vector list_lines_x = searchVerticalLines(); - if (list_lines_x.empty()) return false; - std::vector list_lines_y = separateHorizontalLines(list_lines_x); - if (list_lines_y.empty()) return false; - std::vector 
result_point = pointClustering(list_lines_y); - if (result_point.empty()) return false; - for (int i = 0; i < 3; i++) - { - localization_points.push_back( - Point(static_cast(result_point[i][0]), - static_cast(result_point[i][1] + result_point[i][2]))); - localization_length.push_back(result_point[i][2]); - } - - fixationPoints(localization_points, localization_length); - - - if (localization_points.size() != 3) { return false; } - return true; - -} - -std::vector QRDecode::searchVerticalLines() -{ - result.clear(); + vector result; int temp_length = 0; + uint8_t next_pixel, future_pixel; + vector test_lines; for (int x = 0; x < bin_barcode.rows; x++) { @@ -125,14 +107,15 @@ std::vector QRDecode::searchVerticalLines() if (test_lines.size() == 5) { - length = 0.0; weight = 0.0; + double length = 0.0, weight = 0.0; for (size_t i = 0; i < test_lines.size(); i++) { length += test_lines[i]; } + CV_Assert(length > 0); for (size_t i = 0; i < test_lines.size(); i++) { - if (i == 2) { weight += abs((test_lines[i] / length) - 3.0/7.0); } - else { weight += abs((test_lines[i] / length) - 1.0/7.0); } + if (i == 2) { weight += fabs((test_lines[i] / length) - 3.0/7.0); } + else { weight += fabs((test_lines[i] / length) - 1.0/7.0); } } if (weight < eps_vertical) @@ -147,16 +130,17 @@ std::vector QRDecode::searchVerticalLines() return result; } -std::vector QRDecode::separateHorizontalLines(std::vector list_lines) +vector QRDecode::separateHorizontalLines(vector list_lines) { - result.clear(); + vector result; int temp_length = 0; - int x, y; + uint8_t next_pixel, future_pixel; + vector test_lines; for (size_t pnt = 0; pnt < list_lines.size(); pnt++) { - x = static_cast(list_lines[pnt][0] + list_lines[pnt][2] / 2); - y = static_cast(list_lines[pnt][1]); + int x = static_cast(list_lines[pnt][0] + list_lines[pnt][2] / 2); + int y = static_cast(list_lines[pnt][1]); // --------------- Search horizontal up-lines --------------- // test_lines.clear(); @@ -195,92 +179,36 @@ std::vector QRDecode::separateHorizontalLines(std::vector list_lin if (test_lines.size() == 6) { - length = 0.0; weight = 0.0; + double length = 0.0, weight = 0.0; for (size_t i = 0; i < test_lines.size(); i++) { length += test_lines[i]; } + CV_Assert(length > 0); for (size_t i = 0; i < test_lines.size(); i++) { - if (i % 3 == 0) { weight += abs((test_lines[i] / length) - 3.0/14.0); } - else { weight += abs((test_lines[i] / length) - 1.0/ 7.0); } + if (i % 3 == 0) { weight += fabs((test_lines[i] / length) - 3.0/14.0); } + else { weight += fabs((test_lines[i] / length) - 1.0/ 7.0); } } - } - - if(weight < eps_horizontal) - { - result.push_back(list_lines[pnt]); - } - } - return result; -} - -std::vector QRDecode::pointClustering(std::vector list_lines) -{ - std::vector centers; - std::vector clusters[3]; - double weight_clusters[3] = {0.0, 0.0, 0.0}; - Point basis[3], temp_pnt; - double temp_norm = 0.0, temp_compute_norm, distance[3]; - - basis[0] = Point(static_cast(list_lines[0][1]), static_cast(list_lines[0][0])); - for (size_t i = 1; i < list_lines.size(); i++) - { - temp_pnt = Point(static_cast(list_lines[i][1]), static_cast(list_lines[i][0])); - temp_compute_norm = norm(basis[0] - temp_pnt); - if (temp_norm < temp_compute_norm) - { - basis[1] = temp_pnt; - temp_norm = temp_compute_norm; - } - } - for (size_t i = 1; i < list_lines.size(); i++) - { - temp_pnt = Point(static_cast(list_lines[i][1]), static_cast(list_lines[i][0])); - temp_compute_norm = norm(basis[0] - temp_pnt) + norm(basis[1] - temp_pnt); - if (temp_norm < 
temp_compute_norm) - { - basis[2] = temp_pnt; - temp_norm = temp_compute_norm; - } - } + if(weight < eps_horizontal) + { + result.push_back(list_lines[pnt]); + } - for (size_t i = 0; i < list_lines.size(); i++) - { - temp_pnt = Point(static_cast(list_lines[i][1]), static_cast(list_lines[i][0])); - distance[0] = norm(basis[0] - temp_pnt); - distance[1] = norm(basis[1] - temp_pnt); - distance[2] = norm(basis[2] - temp_pnt); - if (distance[0] < distance[1] && distance[0] < distance[2]) - { - clusters[0].push_back(temp_pnt); - weight_clusters[0] += list_lines[i][2]; - } - else if (distance[1] < distance[0] && distance[1] < distance[2]) - { - clusters[1].push_back(temp_pnt); - weight_clusters[1] += list_lines[i][2]; - } - else - { - clusters[2].push_back(temp_pnt); - weight_clusters[2] += list_lines[i][2]; } } - for (int i = 0; i < 3; i++) + vector point2f_result; + for (size_t i = 0; i < result.size(); i++) { - basis[i] = Point(0, 0); - for (size_t j = 0; j < clusters[i].size(); j++) { basis[i] += clusters[i][j]; } - basis[i] = basis[i] / static_cast(clusters[i].size()); - weight = weight_clusters[i] / (2 * clusters[i].size()); - centers.push_back(Vec3d(basis[i].x, basis[i].y, weight)); + point2f_result.push_back( + Point2f(static_cast(result[i][1]), + static_cast(result[i][0] + result[i][2] / 2))); } - - return centers; + return point2f_result; } -void QRDecode::fixationPoints(std::vector &local_point, std::vector &local_len) +void QRDecode::fixationPoints(vector &local_point) { double cos_angles[3], norm_triangl[3]; @@ -289,182 +217,236 @@ void QRDecode::fixationPoints(std::vector &local_point, std::vector(0, 0) = 1; - vector_mult.at(1, 0) = 1; - vector_mult.at(2, 0) = 1; - vector_mult.at(0, 1) = static_cast((local_point[1] - local_point[0]).x); - vector_mult.at(1, 1) = static_cast((local_point[1] - local_point[0]).y); - vector_mult.at(0, 2) = static_cast((local_point[2] - local_point[0]).x); - vector_mult.at(1, 2) = static_cast((local_point[2] - local_point[0]).y); - double res_vect_mult = determinant(vector_mult); - if (res_vect_mult < 0) + / (2 * norm_triangl[0] * norm_triangl[1]); + + size_t i_min_cos = + (cos_angles[0] < cos_angles[1] && cos_angles[0] < cos_angles[2]) ? 0 : + (cos_angles[1] < cos_angles[0] && cos_angles[1] < cos_angles[2]) ? 
1 : 2; + + std::swap(local_point[0], local_point[i_min_cos]); + + Point2f rpt = local_point[0], bpt = local_point[1], gpt = local_point[2]; + Matx22f m(rpt.x - bpt.x, rpt.y - bpt.y, gpt.x - rpt.x, gpt.y - rpt.y); + if( determinant(m) > 0 ) { - temp_pnt = local_point[1]; - tmp_len = local_len[1]; - local_point[1] = local_point[2]; - local_len[1] = local_len[2]; - local_point[2] = temp_pnt; - local_len[2] = tmp_len; + std::swap(local_point[1], local_point[2]); } } -bool QRDecode::transformation() +bool QRDecode::localization() { - cvtColor(bin_barcode, transform_barcode, COLOR_GRAY2RGB); + Point2f begin, end; + vector list_lines_x = searchVerticalLines(); + if( list_lines_x.empty() ) { return false; } + vector list_lines_y = separateHorizontalLines(list_lines_x); + if( list_lines_y.empty() ) { return false; } + + vector centers; + Mat labels; + if (list_lines_y.size() < 3) { return false; } + kmeans(list_lines_y, 3, labels, + TermCriteria( TermCriteria::EPS+TermCriteria::COUNT, 10, 1.0), + 3, KMEANS_PP_CENTERS, localization_points); + + fixationPoints(localization_points); if (localization_points.size() != 3) { return false; } - Point red = localization_points[0]; - Point green = localization_points[1]; - Point blue = localization_points[2]; - Point adj_b_r_pnt, adj_r_b_pnt, adj_g_r_pnt, adj_r_g_pnt; - Point line_r_b_pnt, line_r_g_pnt, norm_r_b_pnt, norm_r_g_pnt; - adj_b_r_pnt = getTransformationPoint(blue, red, -1); - adj_r_b_pnt = getTransformationPoint(red, blue, -1); - adj_g_r_pnt = getTransformationPoint(green, red, -1); - adj_r_g_pnt = getTransformationPoint(red, green, -1); - line_r_b_pnt = getTransformationPoint(red, blue, -0.91); - line_r_g_pnt = getTransformationPoint(red, green, -0.91); - norm_r_b_pnt = getTransformationPoint(red, blue, 0.0, true); - norm_r_g_pnt = getTransformationPoint(red, green, 0.0, false); - - transformation_points.push_back(intersectionLines( - adj_r_g_pnt, line_r_g_pnt, adj_r_b_pnt, line_r_b_pnt)); - transformation_points.push_back(intersectionLines( - adj_b_r_pnt, norm_r_g_pnt, adj_r_g_pnt, line_r_g_pnt)); - transformation_points.push_back(intersectionLines( - norm_r_b_pnt, adj_g_r_pnt, adj_b_r_pnt, norm_r_g_pnt)); - transformation_points.push_back(intersectionLines( - norm_r_b_pnt, adj_g_r_pnt, adj_r_b_pnt, line_r_b_pnt)); - - experimental_area = getQuadrilateralArea(transformation_points[0], - transformation_points[1], - transformation_points[2], - transformation_points[3]); - std::vector quadrilateral = getQuadrilateral(transformation_points); - transformation_points = quadrilateral; - - int max_length_norm = -1; - size_t transform_size = transformation_points.size(); - for (size_t i = 0; i < transform_size; i++) + if (coeff_expansion > 1.0) { - int len_norm = static_cast(norm(transformation_points[i % transform_size] - - transformation_points[(i + 1) % transform_size])); - if (max_length_norm < len_norm) { max_length_norm = len_norm; } + int width = static_cast(bin_barcode.size().width / coeff_expansion); + int height = static_cast(bin_barcode.size().height / coeff_expansion); + Size new_size(width, height); + Mat intermediate; + resize(bin_barcode, intermediate, new_size); + bin_barcode = intermediate.clone(); + for (size_t i = 0; i < localization_points.size(); i++) + { + localization_points[i] /= coeff_expansion; + } } - std::vector perspective_points; - perspective_points.push_back(Point(0, 0)); - perspective_points.push_back(Point(0, max_length_norm)); - perspective_points.push_back(Point(max_length_norm, max_length_norm)); - 
perspective_points.push_back(Point(max_length_norm, 0)); - - // warpPerspective(bin_barcode, straight_barcode, - // findHomography(transformation_points, perspective_points), - // Size(max_length_norm, max_length_norm)); + for (size_t i = 0; i < localization_points.size(); i++) + { + for (size_t j = i + 1; j < localization_points.size(); j++) + { + if (norm(localization_points[i] - localization_points[j]) < 10) + { + return false; + } + } + } return true; + } -Point QRDecode::getTransformationPoint(Point left, Point center, double cos_angle_rotation, - bool right_rotate) +bool QRDecode::computeTransformationPoints() { - Point temp_pnt, prev_pnt(0, 0), next_pnt, start_pnt(center); - double temp_delta, min_delta; - int steps = 0; + if (localization_points.size() != 3) { return false; } - future_pixel = 255; - while(true) + vector locations, non_zero_elem[3], newHull; + vector new_non_zero_elem[3]; + for (size_t i = 0; i < 3; i++) { - min_delta = std::numeric_limits::max(); - for (int i = -1; i < 2; i++) + Mat mask = Mat::zeros(bin_barcode.rows + 2, bin_barcode.cols + 2, CV_8UC1); + uint8_t next_pixel, future_pixel = 255; + int count_test_lines = 0, index = static_cast(localization_points[i].x); + for (; index < bin_barcode.cols - 1; index++) { - for (int j = -1; j < 2; j++) + next_pixel = bin_barcode.at( + static_cast(localization_points[i].y), index + 1); + if (next_pixel == future_pixel) { - if (i == 0 && j == 0) { continue; } - temp_pnt = Point(start_pnt.x + i, start_pnt.y + j); - temp_delta = abs(getCosVectors(left, center, temp_pnt) - cos_angle_rotation); - if (temp_delta < min_delta && prev_pnt != temp_pnt) + future_pixel = 255 - future_pixel; + count_test_lines++; + if (count_test_lines == 2) { - next_pnt = temp_pnt; - min_delta = temp_delta; + floodFill(bin_barcode, mask, + Point(index + 1, static_cast(localization_points[i].y)), 255, + 0, Scalar(), Scalar(), FLOODFILL_MASK_ONLY); + break; } } } - prev_pnt = start_pnt; - start_pnt = next_pnt; - next_pixel = bin_barcode.at(start_pnt.y, start_pnt.x); - if (next_pixel == future_pixel) + Mat mask_roi = mask(Range(1, bin_barcode.rows - 1), Range(1, bin_barcode.cols - 1)); + findNonZero(mask_roi, non_zero_elem[i]); + newHull.insert(newHull.end(), non_zero_elem[i].begin(), non_zero_elem[i].end()); + } + convexHull(Mat(newHull), locations); + for (size_t i = 0; i < locations.size(); i++) + { + for (size_t j = 0; j < 3; j++) + { + for (size_t k = 0; k < non_zero_elem[j].size(); k++) + { + if (locations[i] == non_zero_elem[j][k]) + { + new_non_zero_elem[j].push_back(locations[i]); + } + } + } + } + + double pentagon_diag_norm = -1; + Point2f down_left_edge_point, up_right_edge_point, up_left_edge_point; + for (size_t i = 0; i < new_non_zero_elem[1].size(); i++) + { + for (size_t j = 0; j < new_non_zero_elem[2].size(); j++) + { + double temp_norm = norm(new_non_zero_elem[1][i] - new_non_zero_elem[2][j]); + if (temp_norm > pentagon_diag_norm) + { + down_left_edge_point = new_non_zero_elem[1][i]; + up_right_edge_point = new_non_zero_elem[2][j]; + pentagon_diag_norm = temp_norm; + } + } + } + if (down_left_edge_point == Point2f(0, 0) || + up_right_edge_point == Point2f(0, 0)) { return false; } + + double max_area = -1; + up_left_edge_point = new_non_zero_elem[0][0]; + for (size_t i = 0; i < new_non_zero_elem[0].size(); i++) + { + double temp_area = getTriangleArea(new_non_zero_elem[0][i], + down_left_edge_point, + up_right_edge_point); + if (max_area < temp_area) + { + up_left_edge_point = new_non_zero_elem[0][i]; + max_area = temp_area; + } + } + 
+ Point2f down_max_delta_point, up_max_delta_point; + double norm_down_max_delta = -1, norm_up_max_delta = -1; + for (size_t i = 0; i < new_non_zero_elem[1].size(); i++) + { + double temp_norm_delta = norm(up_left_edge_point - new_non_zero_elem[1][i]) + + norm(down_left_edge_point - new_non_zero_elem[1][i]); + if (norm_down_max_delta < temp_norm_delta) { - future_pixel = 255 - future_pixel; - steps++; - if (steps == 3) { break; } + down_max_delta_point = new_non_zero_elem[1][i]; + norm_down_max_delta = temp_norm_delta; } } - if (cos_angle_rotation == 0.0) + for (size_t i = 0; i < new_non_zero_elem[2].size(); i++) { - Mat vector_mult(Size(3, 3), CV_32FC1); - vector_mult.at(0, 0) = 1; - vector_mult.at(1, 0) = 1; - vector_mult.at(2, 0) = 1; - vector_mult.at(0, 1) = static_cast((left - center).x); - vector_mult.at(1, 1) = static_cast((left - center).y); - vector_mult.at(0, 2) = static_cast((left - start_pnt).x); - vector_mult.at(1, 2) = static_cast((left - start_pnt).y); - double res_vect_mult = determinant(vector_mult); - if (( right_rotate && res_vect_mult < 0) || - (!right_rotate && res_vect_mult > 0)) + double temp_norm_delta = norm(up_left_edge_point - new_non_zero_elem[2][i]) + + norm(up_right_edge_point - new_non_zero_elem[2][i]); + if (norm_up_max_delta < temp_norm_delta) { - start_pnt = getTransformationPoint(start_pnt, center, -1); + up_max_delta_point = new_non_zero_elem[2][i]; + norm_up_max_delta = temp_norm_delta; } } - return start_pnt; + transformation_points.push_back(down_left_edge_point); + transformation_points.push_back(up_left_edge_point); + transformation_points.push_back(up_right_edge_point); + transformation_points.push_back( + intersectionLines(down_left_edge_point, down_max_delta_point, + up_right_edge_point, up_max_delta_point)); + + vector quadrilateral = getQuadrilateral(transformation_points); + transformation_points = quadrilateral; + + return true; } -Point QRDecode::intersectionLines(Point a1, Point a2, Point b1, Point b2) +Point2f QRDecode::intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2) { - Point result_square_angle( - static_cast( - static_cast - ((a1.x * a2.y - a1.y * a2.x) * (b1.x - b2.x) - - (b1.x * b2.y - b1.y * b2.x) * (a1.x - a2.x)) / - ((a1.x - a2.x) * (b1.y - b2.y) - - (a1.y - a2.y) * (b1.x - b2.x))), - static_cast( - static_cast - ((a1.x * a2.y - a1.y * a2.x) * (b1.y - b2.y) - - (b1.x * b2.y - b1.y * b2.x) * (a1.y - a2.y)) / - ((a1.x - a2.x) * (b1.y - b2.y) - - (a1.y - a2.y) * (b1.x - b2.x))) - ); + Point2f result_square_angle( + ((a1.x * a2.y - a1.y * a2.x) * (b1.x - b2.x) - + (b1.x * b2.y - b1.y * b2.x) * (a1.x - a2.x)) / + ((a1.x - a2.x) * (b1.y - b2.y) - + (a1.y - a2.y) * (b1.x - b2.x)), + ((a1.x * a2.y - a1.y * a2.x) * (b1.y - b2.y) - + (b1.x * b2.y - b1.y * b2.x) * (a1.y - a2.y)) / + ((a1.x - a2.x) * (b1.y - b2.y) - + (a1.y - a2.y) * (b1.x - b2.x)) + ); return result_square_angle; } -std::vector QRDecode::getQuadrilateral(std::vector angle_list) +// test function (if true then ------> else <------ ) +bool QRDecode::testBypassRoute(vector hull, int start, int finish) +{ + int index_hull = start, next_index_hull, hull_size = (int)hull.size(); + double test_length[2] = { 0.0, 0.0 }; + do + { + next_index_hull = index_hull + 1; + if (next_index_hull == hull_size) { next_index_hull = 0; } + test_length[0] += norm(hull[index_hull] - hull[next_index_hull]); + index_hull = next_index_hull; + } + while(index_hull != finish); + + index_hull = start; + do + { + next_index_hull = index_hull - 1; + if (next_index_hull == -1) { 
next_index_hull = hull_size - 1; } + test_length[1] += norm(hull[index_hull] - hull[next_index_hull]); + index_hull = next_index_hull; + } + while(index_hull != finish); + + if (test_length[0] < test_length[1]) { return true; } else { return false; } +} + +vector QRDecode::getQuadrilateral(vector angle_list) { size_t angle_size = angle_list.size(); uint8_t value, mask_value; - Mat mask(bin_barcode.rows + 2, bin_barcode.cols + 2, CV_8UC1); + Mat mask = Mat::zeros(bin_barcode.rows + 2, bin_barcode.cols + 2, CV_8UC1); + Mat fill_bin_barcode = bin_barcode.clone(); for (size_t i = 0; i < angle_size; i++) { LineIterator line_iter(bin_barcode, angle_list[ i % angle_size], @@ -475,119 +457,93 @@ std::vector QRDecode::getQuadrilateral(std::vector angle_list) mask_value = mask.at(line_iter.pos() + Point(1, 1)); if (value == 0 && mask_value == 0) { - floodFill(bin_barcode, mask, line_iter.pos(), 255); + floodFill(fill_bin_barcode, mask, line_iter.pos(), 255, + 0, Scalar(), Scalar(), FLOODFILL_MASK_ONLY); } } } - std::vector locations; - Mat mask_roi = mask(Range(1, bin_barcode.rows - 1), - Range(1, bin_barcode.cols - 1)); + vector locations; + Mat mask_roi = mask(Range(1, bin_barcode.rows - 1), Range(1, bin_barcode.cols - 1)); cv::findNonZero(mask_roi, locations); for (size_t i = 0; i < angle_list.size(); i++) { - locations.push_back(angle_list[i]); + int x = static_cast(angle_list[i].x); + int y = static_cast(angle_list[i].y); + locations.push_back(Point(x, y)); } - std::vector< std::vector > hull(1), approx_hull(1); - convexHull(Mat(locations), hull[0]); - int hull_size = static_cast(hull[0].size()); - - Point min_pnt; - - std::vector min_abc; - double min_abs_cos_abc, abs_cos_abc; - for (int count = 0; count < 4; count++) + vector integer_hull; + convexHull(Mat(locations), integer_hull); + int hull_size = (int)integer_hull.size(); + vector hull(hull_size); + for (int i = 0; i < hull_size; i++) { - min_abs_cos_abc = std::numeric_limits::max(); - for (int i = 0; i < hull_size; i++) - { - Point a = hull[0][ i % hull_size]; - Point b = hull[0][(i + 1) % hull_size]; - Point c = hull[0][(i + 2) % hull_size]; - abs_cos_abc = abs(getCosVectors(a, b, c)); - - bool flag_detect = true; - for (size_t j = 0; j < min_abc.size(); j++) - { - if (min_abc[j] == b) { flag_detect = false; break; } - } - - if (flag_detect && (abs_cos_abc < min_abs_cos_abc)) - { - min_pnt = b; - min_abs_cos_abc = abs_cos_abc; - } - } - min_abc.push_back(min_pnt); + float x = static_cast(integer_hull[i].x); + float y = static_cast(integer_hull[i].y); + hull[i] = Point2f(x, y); } + const double experimental_area = getPolygonArea(hull); - int min_abc_size = static_cast(min_abc.size()); - std::vector index_min_abc(min_abc_size); - for (int i = 0; i < min_abc_size; i++) - { - for (int j = 0; j < hull_size; j++) - { - if (hull[0][j] == min_abc[i]) { index_min_abc[i] = j; break; } - } - } - - std::vector result_hull_point(angle_size); - double min_norm, temp_norm; + vector result_hull_point(angle_size); + double min_norm; for (size_t i = 0; i < angle_size; i++) { min_norm = std::numeric_limits::max(); Point closest_pnt; - for (int j = 0; j < min_abc_size; j++) + for (int j = 0; j < hull_size; j++) { - if (min_norm > norm(hull[0][index_min_abc[j]] - angle_list[i])) + double temp_norm = norm(hull[j] - angle_list[i]); + if (min_norm > temp_norm) { - min_norm = norm(hull[0][index_min_abc[j]] - angle_list[i]); - closest_pnt = hull[0][index_min_abc[j]]; + min_norm = temp_norm; + closest_pnt = hull[j]; } } result_hull_point[i] = closest_pnt; } - int 
start_line[2] = {0, 0}, finish_line[2] = {0, 0}, unstable_pnt = 0; + int start_line[2] = { 0, 0 }, finish_line[2] = { 0, 0 }, unstable_pnt = 0; for (int i = 0; i < hull_size; i++) { - if (result_hull_point[3] == hull[0][i]) { start_line[0] = i; } - if (result_hull_point[2] == hull[0][i]) { finish_line[0] = start_line[1] = i; } - if (result_hull_point[1] == hull[0][i]) { finish_line[1] = i; } - if (result_hull_point[0] == hull[0][i]) { unstable_pnt = i; } + if (result_hull_point[2] == hull[i]) { start_line[0] = i; } + if (result_hull_point[1] == hull[i]) { finish_line[0] = start_line[1] = i; } + if (result_hull_point[0] == hull[i]) { finish_line[1] = i; } + if (result_hull_point[3] == hull[i]) { unstable_pnt = i; } } - int index_hull, extra_index_hull, next_index_hull, extra_next_index_hull, count_points; + int index_hull, extra_index_hull, next_index_hull, extra_next_index_hull; Point result_side_begin[4], result_side_end[4]; + bool bypass_orientation = testBypassRoute(hull, start_line[0], finish_line[0]); + bool extra_bypass_orientation; + min_norm = std::numeric_limits::max(); index_hull = start_line[0]; - count_points = abs(start_line[0] - finish_line[0]); do { - if (count_points > hull_size / 2) { next_index_hull = index_hull + 1; } + if (bypass_orientation) { next_index_hull = index_hull + 1; } else { next_index_hull = index_hull - 1; } if (next_index_hull == hull_size) { next_index_hull = 0; } if (next_index_hull == -1) { next_index_hull = hull_size - 1; } - Point angle_closest_pnt = norm(hull[0][index_hull] - angle_list[2]) > - norm(hull[0][index_hull] - angle_list[3]) ? angle_list[3] : angle_list[2]; + Point angle_closest_pnt = norm(hull[index_hull] - angle_list[1]) > + norm(hull[index_hull] - angle_list[2]) ? angle_list[2] : angle_list[1]; Point intrsc_line_hull = - intersectionLines(hull[0][index_hull], hull[0][next_index_hull], - angle_list[2], angle_list[3]); - temp_norm = getCosVectors(hull[0][index_hull], intrsc_line_hull, angle_closest_pnt); + intersectionLines(hull[index_hull], hull[next_index_hull], + angle_list[1], angle_list[2]); + double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt); if (min_norm > temp_norm && - norm(hull[0][index_hull] - hull[0][next_index_hull]) > - norm(angle_list[2] - angle_list[3]) / 10) + norm(hull[index_hull] - hull[next_index_hull]) > + norm(angle_list[1] - angle_list[2]) / 10) { min_norm = temp_norm; - result_side_begin[0] = hull[0][index_hull]; - result_side_end[0] = hull[0][next_index_hull]; + result_side_begin[0] = hull[index_hull]; + result_side_end[0] = hull[next_index_hull]; } @@ -597,104 +553,96 @@ std::vector QRDecode::getQuadrilateral(std::vector angle_list) if (min_norm == std::numeric_limits::max()) { - result_side_begin[0] = angle_list[2]; - result_side_end[0] = angle_list[3]; + result_side_begin[0] = angle_list[1]; + result_side_end[0] = angle_list[2]; } min_norm = std::numeric_limits::max(); index_hull = start_line[1]; - count_points = abs(start_line[1] - finish_line[1]); + bypass_orientation = testBypassRoute(hull, start_line[1], finish_line[1]); do { - if (count_points > hull_size / 2) { next_index_hull = index_hull + 1; } + if (bypass_orientation) { next_index_hull = index_hull + 1; } else { next_index_hull = index_hull - 1; } if (next_index_hull == hull_size) { next_index_hull = 0; } if (next_index_hull == -1) { next_index_hull = hull_size - 1; } - Point angle_closest_pnt = norm(hull[0][index_hull] - angle_list[1]) > - norm(hull[0][index_hull] - angle_list[2]) ? 
angle_list[2] : angle_list[1]; + Point angle_closest_pnt = norm(hull[index_hull] - angle_list[0]) > + norm(hull[index_hull] - angle_list[1]) ? angle_list[1] : angle_list[0]; Point intrsc_line_hull = - intersectionLines(hull[0][index_hull], hull[0][next_index_hull], - angle_list[1], angle_list[2]); - temp_norm = getCosVectors(hull[0][index_hull], intrsc_line_hull, angle_closest_pnt); + intersectionLines(hull[index_hull], hull[next_index_hull], + angle_list[0], angle_list[1]); + double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt); if (min_norm > temp_norm && - norm(hull[0][index_hull] - hull[0][next_index_hull]) > - norm(angle_list[1] - angle_list[2]) / 20) + norm(hull[index_hull] - hull[next_index_hull]) > + norm(angle_list[0] - angle_list[1]) / 20) { min_norm = temp_norm; - result_side_begin[1] = hull[0][index_hull]; - result_side_end[1] = hull[0][next_index_hull]; + result_side_begin[1] = hull[index_hull]; + result_side_end[1] = hull[next_index_hull]; } - index_hull = next_index_hull; } while(index_hull != finish_line[1]); if (min_norm == std::numeric_limits::max()) { - result_side_begin[1] = angle_list[1]; - result_side_end[1] = angle_list[2]; + result_side_begin[1] = angle_list[0]; + result_side_end[1] = angle_list[1]; } - double test_norm[4] = { 0.0, 0.0, 0.0, 0.0 }; - int test_index[4]; - for (int i = 0; i < 4; i++) - { - test_index[i] = (i < 2) ? static_cast(start_line[0]) - : static_cast(finish_line[1]); - do - { - next_index_hull = ((i + 1) % 2 != 0) ? test_index[i] + 1 : test_index[i] - 1; - if (next_index_hull == hull_size) { next_index_hull = 0; } - if (next_index_hull == -1) { next_index_hull = hull_size - 1; } - test_norm[i] += norm(hull[0][next_index_hull] - hull[0][unstable_pnt]); - test_index[i] = next_index_hull; - } - while(test_index[i] != unstable_pnt); - } + bypass_orientation = testBypassRoute(hull, start_line[0], unstable_pnt); + extra_bypass_orientation = testBypassRoute(hull, finish_line[1], unstable_pnt); - std::vector result_angle_list(4), test_result_angle_list(4); - double min_area = std::numeric_limits::max(), test_area; + vector result_angle_list(4), test_result_angle_list(4); + double min_diff_area = std::numeric_limits::max(), test_diff_area; index_hull = start_line[0]; + double standart_norm = std::max( + norm(result_side_begin[0] - result_side_end[0]), + norm(result_side_begin[1] - result_side_end[1])); do { - if (test_norm[0] < test_norm[1]) { next_index_hull = index_hull + 1; } + if (bypass_orientation) { next_index_hull = index_hull + 1; } else { next_index_hull = index_hull - 1; } if (next_index_hull == hull_size) { next_index_hull = 0; } if (next_index_hull == -1) { next_index_hull = hull_size - 1; } + if (norm(hull[index_hull] - hull[next_index_hull]) < standart_norm / 10.0) + { index_hull = next_index_hull; continue; } + extra_index_hull = finish_line[1]; do { - if (test_norm[2] < test_norm[3]) { extra_next_index_hull = extra_index_hull + 1; } + if (extra_bypass_orientation) { extra_next_index_hull = extra_index_hull + 1; } else { extra_next_index_hull = extra_index_hull - 1; } if (extra_next_index_hull == hull_size) { extra_next_index_hull = 0; } if (extra_next_index_hull == -1) { extra_next_index_hull = hull_size - 1; } + if (norm(hull[extra_index_hull] - hull[extra_next_index_hull]) < standart_norm / 10.0) + { extra_index_hull = extra_next_index_hull; continue; } + test_result_angle_list[0] - = intersectionLines(result_side_begin[0], result_side_end[0], - result_side_begin[1], result_side_end[1]); + = 
intersectionLines(result_side_begin[0], result_side_end[0], + result_side_begin[1], result_side_end[1]); test_result_angle_list[1] - = intersectionLines(result_side_begin[1], result_side_end[1], - hull[0][extra_index_hull], hull[0][extra_next_index_hull]); + = intersectionLines(result_side_begin[1], result_side_end[1], + hull[extra_index_hull], hull[extra_next_index_hull]); test_result_angle_list[2] - = intersectionLines(hull[0][extra_index_hull], hull[0][extra_next_index_hull], - hull[0][index_hull], hull[0][next_index_hull]); + = intersectionLines(hull[extra_index_hull], hull[extra_next_index_hull], + hull[index_hull], hull[next_index_hull]); test_result_angle_list[3] - = intersectionLines(hull[0][index_hull], hull[0][next_index_hull], - result_side_begin[0], result_side_end[0]); - test_area = getQuadrilateralArea(test_result_angle_list[0], - test_result_angle_list[1], - test_result_angle_list[2], - test_result_angle_list[3]); - if (min_area > test_area) + = intersectionLines(hull[index_hull], hull[next_index_hull], + result_side_begin[0], result_side_end[0]); + + test_diff_area = fabs(getPolygonArea(test_result_angle_list) - experimental_area); + if (min_diff_area > test_diff_area) { - min_area = test_area; + min_diff_area = test_diff_area; for (size_t i = 0; i < test_result_angle_list.size(); i++) { result_angle_list[i] = test_result_angle_list[i]; @@ -708,48 +656,42 @@ std::vector QRDecode::getQuadrilateral(std::vector angle_list) index_hull = next_index_hull; } while(index_hull != unstable_pnt); - - if (norm(result_angle_list[0] - angle_list[2]) > - norm(angle_list[2] - angle_list[1]) / 3) { result_angle_list[0] = angle_list[2]; } - - if (norm(result_angle_list[1] - angle_list[1]) > - norm(angle_list[1] - angle_list[0]) / 3) { result_angle_list[1] = angle_list[1]; } - - if (norm(result_angle_list[2] - angle_list[0]) > - norm(angle_list[0] - angle_list[3]) / 3) { result_angle_list[2] = angle_list[0]; } - - if (norm(result_angle_list[3] - angle_list[3]) > - norm(angle_list[3] - angle_list[2]) / 3) { result_angle_list[3] = angle_list[3]; } - - - return result_angle_list; } -// b __________ c -// / | -// / | -// / S | -// / | -// a --------------- d +// b +// / | +// / | +// / | +// / S | +// / | +// a ----- c -double QRDecode::getQuadrilateralArea(Point a, Point b, Point c, Point d) +double QRDecode::getTriangleArea(Point2f a, Point2f b, Point2f c) { - double length_sides[4], perimeter = 0.0, result_area = 1.0; - length_sides[0] = norm(a - b); length_sides[1] = norm(b - c); - length_sides[2] = norm(c - d); length_sides[3] = norm(d - a); - - for (int i = 0; i < 4; i++) { perimeter += length_sides[i]; } - perimeter /= 2; + double norm_sides[] = { norm(a - b), norm(b - c), norm(c - a) }; + double half_perimeter = (norm_sides[0] + norm_sides[1] + norm_sides[2]) / 2.0; + double triangle_area = sqrt(half_perimeter * + (half_perimeter - norm_sides[0]) * + (half_perimeter - norm_sides[1]) * + (half_perimeter - norm_sides[2])); + return triangle_area; +} - for (int i = 0; i < 4; i++) +double QRDecode::getPolygonArea(vector points) +{ + CV_Assert(points.size() >= 3); + if (points.size() == 3) + { return getTriangleArea(points[0], points[1], points[2]); } + else { - result_area *= (perimeter - length_sides[i]); + double result_area = 0.0; + for (size_t i = 1; i < points.size() - 1; i++) + { + result_area += getTriangleArea(points[0], points[i], points[i + 1]); + } + return result_area; } - - result_area = sqrt(result_area); - - return result_area; } // / | b @@ -757,22 +699,86 @@ double 
QRDecode::getQuadrilateralArea(Point a, Point b, Point c, Point d) // / | // a/ | c -double QRDecode::getCosVectors(Point a, Point b, Point c) +double QRDecode::getCosVectors(Point2f a, Point2f b, Point2f c) { - return ((a - b).x * (c - b).x + (a - b).y * (c - b).y) / (norm(a - b) * norm(c - b)); + return ((a - b).x * (c - b).x + (a - b).y * (c - b).y) + / (norm(a - b) * norm(c - b)); } -CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x, double eps_y) +bool QRDecode::transformation() { - CV_Assert(in.isMat()); - CV_Assert(in.getMat().type() == CV_8UC1); + if(!computeTransformationPoints()) { return false; } + + double max_length_norm = -1; + size_t transform_size = transformation_points.size(); + for (size_t i = 0; i < transform_size; i++) + { + double len_norm = norm(transformation_points[i % transform_size] - + transformation_points[(i + 1) % transform_size]); + max_length_norm = std::max(max_length_norm, len_norm); + } + + Point2f transformation_points_[] = + { + transformation_points[0], + transformation_points[1], + transformation_points[2], + transformation_points[3] + }; + + Point2f perspective_points[] = + { + Point2f(0.f, 0.f), Point2f(0.f, (float)max_length_norm), + Point2f((float)max_length_norm, (float)max_length_norm), + Point2f((float)max_length_norm, 0.f) + }; + + Mat H = getPerspectiveTransform(transformation_points_, perspective_points); + + warpPerspective(bin_barcode, straight_barcode, H, + Size(static_cast(max_length_norm), + static_cast(max_length_norm))); + return true; +} + + +struct QRCodeDetector::Impl +{ +public: + Impl() { epsX = 0.2; epsY = 0.1; } + ~Impl() {} + + double epsX, epsY; +}; + +QRCodeDetector::QRCodeDetector() : p(new Impl) {} +QRCodeDetector::~QRCodeDetector() {} + +void QRCodeDetector::setEpsX(double epsX) { p->epsX = epsX; } +void QRCodeDetector::setEpsY(double epsY) { p->epsY = epsY; } + +bool QRCodeDetector::detect(InputArray in, OutputArray points) const +{ + Mat inarr = in.getMat(); + CV_Assert(!inarr.empty()); + CV_Assert(inarr.type() == CV_8UC1); QRDecode qrdec; - qrdec.init(in.getMat(), eps_x, eps_y); + qrdec.init(inarr, p->epsX, p->epsY); qrdec.binarization(); if (!qrdec.localization()) { return false; } if (!qrdec.transformation()) { return false; } - points = qrdec.getTransformationPoints(); + vector pnts2f = qrdec.getTransformationPoints(); + Mat(pnts2f).convertTo(points, points.fixedType() ? points.type() : CV_32FC2); return true; } +CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x, double eps_y) +{ + QRCodeDetector qrdetector; + qrdetector.setEpsX(eps_x); + qrdetector.setEpsY(eps_y); + + return qrdetector.detect(in, points); +} + } diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp index 87f5ce525b..82e9990530 100644 --- a/modules/objdetect/test/test_qrcode.cpp +++ b/modules/objdetect/test/test_qrcode.cpp @@ -1,74 +1,115 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// Intel License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. 
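Note (illustrative, not part of the patch): a minimal C++ sketch of how the QRCodeDetector API introduced above is meant to be used. The objdetect header location, the input file name, and the return-code handling are assumptions; the epsX/epsY values shown are simply the defaults from Impl().

#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    // detect() requires a non-empty CV_8UC1 image, so load as grayscale
    cv::Mat img = cv::imread("qr_sample.jpg", cv::IMREAD_GRAYSCALE); // hypothetical file name
    std::vector<cv::Point2f> corners;        // filled as CV_32FC2 by detect()
    cv::QRCodeDetector detector;
    detector.setEpsX(0.2);                   // default from Impl()
    detector.setEpsY(0.1);                   // default from Impl()
    bool found = !img.empty() && detector.detect(img, corners);
    return found ? 0 : 1;
}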
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #include "test_precomp.hpp" -namespace opencv_test { namespace { -TEST(Objdetect_QRCode, regression) +namespace opencv_test +{ + +String qrcode_images_name[] = { + "20110817_030.jpg", + "20110817_048.jpg", + "img_20120226_161648.jpg", + "img_2714.jpg", + "img_2716.jpg", + "img_3011.jpg", + "img_3029.jpg", + "img_3070.jpg", + "qr_test_030.jpg" +}; + +// #define UPDATE_QRCODE_TEST_DATA +#ifdef UPDATE_QRCODE_TEST_DATA + +TEST(Objdetect_QRCode, generate_test_data) { String root = cvtest::TS::ptr()->get_data_path() + "qrcode/"; - // String cascades[] = - // { - // root + "haarcascade_frontalface_alt.xml", - // root + "lbpcascade_frontalface.xml", - // String() - // }; - - // vector objects; - // RNG rng((uint64)-1); - - // for( int i = 0; !cascades[i].empty(); i++ ) - // { - // printf("%d. 
%s\n", i, cascades[i].c_str()); - // CascadeClassifier cascade(cascades[i]); - // for( int j = 0; j < 100; j++ ) - // { - // int width = rng.uniform(1, 100); - // int height = rng.uniform(1, 100); - // Mat img(height, width, CV_8U); - // randu(img, 0, 256); - // cascade.detectMultiScale(img, objects); - // } - // } + String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json"; + FileStorage file_config(dataset_config, FileStorage::WRITE); + + file_config << "test_images" << "["; + size_t images_count = sizeof(qrcode_images_name) / sizeof(String); + for (size_t i = 0; i < images_count; i++) + { + file_config << "{:" << "image_name" << qrcode_images_name[i]; + String image_path = root + qrcode_images_name[i]; + std::vector transform; + Mat src = imread(image_path, IMREAD_GRAYSCALE); + EXPECT_TRUE(detectQRCode(src, transform)); + file_config << "x" << "[:"; + for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].x; } + file_config << "]"; + file_config << "y" << "[:"; + for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].y; } + file_config << "]" << "}"; + } + file_config << "]"; + file_config.release(); } -}} // namespace +#else + +typedef testing::TestWithParam< String > Objdetect_QRCode; +TEST_P(Objdetect_QRCode, regression) +{ + String root = cvtest::TS::ptr()->get_data_path() + "qrcode/"; + String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json"; + FileStorage file_config(dataset_config, FileStorage::READ); + const int pixels_error = 3; + + std::vector corners; + String image_path = root + String(GetParam()); + Mat src = imread(image_path, IMREAD_GRAYSCALE); + EXPECT_TRUE(detectQRCode(src, corners)); + + if (file_config.isOpened()) + { + FileNode images_list = file_config["test_images"]; + int index = 0, images_count = static_cast(images_list.size()); + ASSERT_GT(images_count, 0); + + bool runTestsFlag = false; + String name_current_image = String(GetParam()); + for (; index < images_count; index++) + { + String name_test_image = images_list[index]["image_name"]; + if (name_test_image == name_current_image) + { + for (int i = 0; i < 4; i++) + { + int x = images_list[index]["x"][i]; + int y = images_list[index]["y"][i]; + EXPECT_NEAR(x, corners[i].x, pixels_error); + EXPECT_NEAR(y, corners[i].y, pixels_error); + } + runTestsFlag = true; + } + } + if (!runTestsFlag) + { + std::cout << "Not found results for " << name_current_image; + std::cout << " image in dataset_config.json file." << std::endl; + } + + file_config.release(); + } + else + { + std::cout << " Not found dataset_config.json file." 
<< std::endl; + } +} + +INSTANTIATE_TEST_CASE_P(objdetect, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name)); + +TEST(Objdetect_QRCode, not_found_qrcode) +{ + std::vector corners; + Mat zero_image = Mat::zeros(256, 256, CV_8UC1); + EXPECT_FALSE(detectQRCode(zero_image, corners)); +} + +#endif + +} // namespace diff --git a/modules/photo/src/tonemap.cpp b/modules/photo/src/tonemap.cpp index 2911fc55c8..053360f4c2 100644 --- a/modules/photo/src/tonemap.cpp +++ b/modules/photo/src/tonemap.cpp @@ -140,6 +140,7 @@ public: double max; minMaxLoc(gray_img, NULL, &max); + CV_Assert(max > 0); Mat map; log(gray_img + 1.0f, map); @@ -429,12 +430,15 @@ public: for(int i = 0; i < max_iterations; i++) { calculateProduct(p, product); - float alpha = rr / static_cast(p.dot(product)); + double dprod = p.dot(product); + CV_Assert(fabs(dprod) > 0); + float alpha = rr / static_cast(dprod); r -= alpha * product; x += alpha * p; float new_rr = static_cast(r.dot(r)); + CV_Assert(fabs(rr) > 0); p = r + (new_rr / rr) * p; rr = new_rr; diff --git a/modules/shape/src/sc_dis.cpp b/modules/shape/src/sc_dis.cpp index 26dd459378..cf4f9fe3a0 100644 --- a/modules/shape/src/sc_dis.cpp +++ b/modules/shape/src/sc_dis.cpp @@ -743,6 +743,7 @@ void SCDMatcher::hungarian(cv::Mat &costMatrix, std::vector &outMatc // calculate symmetric shape context cost cv::Mat trueCostMatrix(costMatrix, cv::Rect(0,0,sizeScd1, sizeScd2)); + CV_Assert(!trueCostMatrix.empty()); float leftcost = 0; for (int nrow=0; nrow %g)", desc, diff, success_err_level ); code = TS::FAIL_BAD_ACCURACY; break; case CMP_EPS_INVALID_TEST_DATA: diff --git a/modules/videoio/include/opencv2/videoio/container_avi.private.hpp b/modules/videoio/include/opencv2/videoio/container_avi.private.hpp index 61ea74dece..7a13ac45d7 100644 --- a/modules/videoio/include/opencv2/videoio/container_avi.private.hpp +++ b/modules/videoio/include/opencv2/videoio/container_avi.private.hpp @@ -153,7 +153,7 @@ public: bool initContainer(const String& filename, double fps, Size size, bool iscolor); void startWriteAVI(int stream_count); void writeStreamHeader(Codecs codec_); - void startWriteChunk(int fourcc); + void startWriteChunk(uint32_t fourcc); void endWriteChunk(); int getAVIIndex(int stream_number, StreamType strm_type); diff --git a/modules/videoio/src/container_avi.cpp b/modules/videoio/src/container_avi.cpp index 584751cf75..0c581b3f51 100644 --- a/modules/videoio/src/container_avi.cpp +++ b/modules/videoio/src/container_avi.cpp @@ -3,10 +3,29 @@ // of this distribution and at http://opencv.org/license.html. 
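Note (illustrative, not part of the patch): the tonemap.cpp changes above all follow the same defensive pattern — assert that a denominator is non-zero before dividing, so a degenerate input fails with a clear error instead of silently producing NaN/Inf. A minimal sketch of that pattern, with a hypothetical helper name:

#include <opencv2/core.hpp>
#include <cmath>

// hypothetical helper, mirroring the guarded divisions added in tonemap.cpp
static float safeScale(float numerator, double denominator)
{
    CV_Assert(std::fabs(denominator) > 0);   // same style of check as the patch
    return numerator / static_cast<float>(denominator);
}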
#include "opencv2/videoio/container_avi.private.hpp" +#include +#include +#include namespace cv { +// Utility function for safe integer conversions +template +inline D safe_int_cast(S val) +{ + typedef std::numeric_limits st; + typedef std::numeric_limits dt; + CV_StaticAssert(st::is_integer && dt::is_integer, "Integer type is expected"); + const bool in_range_r = (double)val <= (double)dt::max(); + const bool in_range_l = (double)val >= (double)dt::min(); + if (!in_range_r || !in_range_l) + { + CV_Error_(cv::Error::StsOutOfRange, ("Can not convert integer values (%s -> %s), value 0x%llx is out of range", typeid(S).name(), typeid(D).name(), val)); + } + return static_cast(val); +} + const uint32_t RIFF_CC = CV_FOURCC('R','I','F','F'); const uint32_t LIST_CC = CV_FOURCC('L','I','S','T'); const uint32_t HDRL_CC = CV_FOURCC('h','d','r','l'); @@ -116,12 +135,15 @@ public: bool open(const String& filename); void close(); operator bool(); - VideoInputStream& operator=(const VideoInputStream& stream); private: + VideoInputStream(const VideoInputStream&); + VideoInputStream& operator=(const VideoInputStream&); + +private: + std::ifstream input; bool m_is_valid; String m_fname; - FILE* m_f; }; #pragma pack(pop) @@ -174,12 +196,12 @@ String fourccToString(uint32_t fourcc) return format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255); } -VideoInputStream::VideoInputStream(): m_is_valid(false), m_f(0) +VideoInputStream::VideoInputStream(): m_is_valid(false) { m_fname = String(); } -VideoInputStream::VideoInputStream(const String& filename): m_is_valid(false), m_f(0) +VideoInputStream::VideoInputStream(const String& filename): m_is_valid(false) { m_fname = filename; open(filename); @@ -187,17 +209,14 @@ VideoInputStream::VideoInputStream(const String& filename): m_is_valid(false), m bool VideoInputStream::isOpened() const { - return m_f != 0; + return input.is_open(); } bool VideoInputStream::open(const String& filename) { close(); - - m_f = fopen(filename.c_str(), "rb"); - + input.open(filename.c_str(), std::ios_base::binary); m_is_valid = isOpened(); - return m_is_valid; } @@ -206,9 +225,7 @@ void VideoInputStream::close() if(isOpened()) { m_is_valid = false; - - fclose(m_f); - m_f = 0; + input.close(); } } @@ -216,7 +233,8 @@ VideoInputStream& VideoInputStream::read(char* buf, uint64_t count) { if(isOpened()) { - m_is_valid = (count == fread((void*)buf, 1, (size_t)count, m_f)); + input.read(buf, safe_int_cast(count)); + m_is_valid = (input.gcount() == (std::streamsize)count); } return *this; @@ -224,14 +242,15 @@ VideoInputStream& VideoInputStream::read(char* buf, uint64_t count) VideoInputStream& VideoInputStream::seekg(uint64_t pos) { - m_is_valid = (fseek(m_f, (int32_t)pos, SEEK_SET) == 0); - + input.clear(); + input.seekg(safe_int_cast(pos)); + m_is_valid = !input.eof(); return *this; } uint64_t VideoInputStream::tellg() { - return ftell(m_f); + return input.tellg(); } VideoInputStream::operator bool() @@ -239,16 +258,6 @@ VideoInputStream::operator bool() return m_is_valid; } -VideoInputStream& VideoInputStream::operator=(const VideoInputStream& stream) -{ - if (this != &stream) { - m_fname = stream.m_fname; - // m_f = stream.m_f; - open(m_fname); - } - return *this; -} - VideoInputStream::~VideoInputStream() { close(); @@ -591,7 +600,7 @@ public: ~BitStream() { close(); } bool open(const String& filename); - bool isOpened() const { return m_f != 0; } + bool isOpened() const { return output.is_open(); } void close(); void writeBlock(); @@ -600,20 
+609,24 @@ public: void putBytes(const uchar* buf, int count); void putShort(int val); - void putInt(int val); + void putInt(uint32_t val); void jputShort(int val); - void patchInt(int val, size_t pos); + void patchInt(uint32_t val, size_t pos); void jput(unsigned currval); void jflush(unsigned currval, int bitIdx); +private: + BitStream(const BitStream &); + BitStream &operator=(const BitStream&); + protected: + std::ofstream output; std::vector m_buf; uchar* m_start; uchar* m_end; uchar* m_current; size_t m_pos; bool m_is_opened; - FILE* m_f; }; static const size_t DEFAULT_BLOCK_SIZE = (1 << 15); @@ -624,7 +637,6 @@ BitStream::BitStream() m_start = &m_buf[0]; m_end = m_start + DEFAULT_BLOCK_SIZE; m_is_opened = false; - m_f = 0; m_current = 0; m_pos = 0; } @@ -632,9 +644,7 @@ BitStream::BitStream() bool BitStream::open(const String& filename) { close(); - m_f = fopen(filename.c_str(), "wb"); - if( !m_f ) - return false; + output.open(filename.c_str(), std::ios_base::binary); m_current = m_start; m_pos = 0; return true; @@ -643,25 +653,22 @@ bool BitStream::open(const String& filename) void BitStream::close() { writeBlock(); - if( m_f ) - fclose(m_f); - m_f = 0; + output.close(); } void BitStream::writeBlock() { - size_t wsz0 = m_current - m_start; - if( wsz0 > 0 && m_f ) + ptrdiff_t wsz0 = m_current - m_start; + if( wsz0 > 0 ) { - size_t wsz = fwrite(m_start, 1, wsz0, m_f); - CV_Assert( wsz == wsz0 ); + output.write((char*)m_start, wsz0); } m_pos += wsz0; m_current = m_start; } size_t BitStream::getPos() const { - return (size_t)(m_current - m_start) + m_pos; + return safe_int_cast(m_current - m_start) + m_pos; } void BitStream::putByte(int val) @@ -674,7 +681,7 @@ void BitStream::putByte(int val) void BitStream::putBytes(const uchar* buf, int count) { uchar* data = (uchar*)buf; - CV_Assert(m_f && data && m_current && count >= 0); + CV_Assert(data && m_current && count >= 0); if( m_current >= m_end ) writeBlock(); @@ -706,7 +713,7 @@ void BitStream::putShort(int val) writeBlock(); } -void BitStream::putInt(int val) +void BitStream::putInt(uint32_t val) { m_current[0] = (uchar)val; m_current[1] = (uchar)(val >> 8); @@ -726,11 +733,11 @@ void BitStream::jputShort(int val) writeBlock(); } -void BitStream::patchInt(int val, size_t pos) +void BitStream::patchInt(uint32_t val, size_t pos) { if( pos >= m_pos ) { - ptrdiff_t delta = pos - m_pos; + ptrdiff_t delta = safe_int_cast(pos - m_pos); CV_Assert( delta < m_current - m_start ); m_start[delta] = (uchar)val; m_start[delta+1] = (uchar)(val >> 8); @@ -739,12 +746,11 @@ void BitStream::patchInt(int val, size_t pos) } else { - CV_Assert(pos < (1u<<31)); - long fpos = ftell(m_f); - fseek(m_f, (long)pos, SEEK_SET); + std::streamoff fpos = output.tellp(); + output.seekp(safe_int_cast(pos)); uchar buf[] = { (uchar)val, (uchar)(val >> 8), (uchar)(val >> 16), (uchar)(val >> 24) }; - fwrite(buf, 1, 4, m_f); - fseek(m_f, fpos, SEEK_SET); + output.write((char *)buf, 4); + output.seekp(fpos); } } @@ -876,7 +882,7 @@ void AVIWriteContainer::writeStreamHeader(Codecs codec_) strm->putInt(0); strm->putInt(SUG_BUFFER_SIZE); - strm->putInt(AVI_DWQUALITY); + strm->putInt(static_cast(AVI_DWQUALITY)); strm->putInt(0); strm->putShort(0); strm->putShort(0); @@ -935,7 +941,7 @@ void AVIWriteContainer::writeStreamHeader(Codecs codec_) strm->putInt(MOVI_CC); } -void AVIWriteContainer::startWriteChunk(int fourcc) +void AVIWriteContainer::startWriteChunk(uint32_t fourcc) { CV_Assert(fourcc != 0); strm->putInt(fourcc); @@ -949,9 +955,12 @@ void 
AVIWriteContainer::endWriteChunk() if( !AVIChunkSizeIndex.empty() ) { size_t currpos = strm->getPos(); + CV_Assert(currpos > 4); + currpos -= 4; size_t pospos = AVIChunkSizeIndex.back(); AVIChunkSizeIndex.pop_back(); - int chunksz = (int)(currpos - (pospos + 4)); + CV_Assert(currpos >= pospos); + uint32_t chunksz = safe_int_cast(currpos - pospos); strm->patchInt(chunksz, pospos); } } @@ -966,8 +975,8 @@ int AVIWriteContainer::getAVIIndex(int stream_number, StreamType strm_type) { case dc: return CV_FOURCC(strm_indx[0], strm_indx[1], 'd', 'c'); case pc: return CV_FOURCC(strm_indx[0], strm_indx[1], 'p', 'c'); case wb: return CV_FOURCC(strm_indx[0], strm_indx[1], 'w', 'b'); - default: return CV_FOURCC(strm_indx[0], strm_indx[1], 'd', 'b'); } + return CV_FOURCC(strm_indx[0], strm_indx[1], 'd', 'b'); } void AVIWriteContainer::writeIndex(int stream_number, StreamType strm_type) @@ -987,7 +996,7 @@ void AVIWriteContainer::writeIndex(int stream_number, StreamType strm_type) void AVIWriteContainer::finishWriteAVI() { - int nframes = (int)frameOffset.size(); + uint32_t nframes = safe_int_cast(frameOffset.size()); // Record frames numbers to AVI Header while (!frameNumIndexes.empty()) { diff --git a/modules/videostab/src/inpainting.cpp b/modules/videostab/src/inpainting.cpp index ea69060b48..3fad005318 100644 --- a/modules/videostab/src/inpainting.cpp +++ b/modules/videostab/src/inpainting.cpp @@ -447,7 +447,7 @@ public: } } - float wSumInv = 1.f / wSum; + float wSumInv = (std::fabs(wSum) > 0) ? (1.f / wSum) : 0; // if wSum is 0, c1-c3 will be 0 too frame(y,x) = Point3_( static_cast(c1*wSumInv), static_cast(c2*wSumInv), diff --git a/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp b/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp index 49ee1bf215..0e6cb0a6ad 100644 --- a/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp +++ b/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp @@ -20,29 +20,32 @@ using namespace cv; */ int main( int argc, char** argv ) { - //! [basic-linear-transform-parameters] - double alpha = 1.0; /*< Simple contrast control */ - int beta = 0; /*< Simple brightness control */ - //! [basic-linear-transform-parameters] - /// Read image given by user //! [basic-linear-transform-load] - String imageName("../data/lena.jpg"); // by default - if (argc > 1) + CommandLineParser parser( argc, argv, "{@input | ../data/lena.jpg | input image}" ); + Mat image = imread( parser.get( "@input" ) ); + if( image.empty() ) { - imageName = argv[1]; + cout << "Could not open or find the image!\n" << endl; + cout << "Usage: " << argv[0] << " " << endl; + return -1; } - Mat image = imread( imageName ); //! [basic-linear-transform-load] + //! [basic-linear-transform-output] Mat new_image = Mat::zeros( image.size(), image.type() ); //! [basic-linear-transform-output] + //! [basic-linear-transform-parameters] + double alpha = 1.0; /*< Simple contrast control */ + int beta = 0; /*< Simple brightness control */ + /// Initialize values cout << " Basic Linear Transforms " << endl; cout << "-------------------------" << endl; cout << "* Enter the alpha value [1.0-3.0]: "; cin >> alpha; cout << "* Enter the beta value [0-100]: "; cin >> beta; + //! [basic-linear-transform-parameters] /// Do the operation new_image(i,j) = alpha*image(i,j) + beta /// Instead of these 'for' loops we could have used simply: @@ -51,19 +54,15 @@ int main( int argc, char** argv ) //! 
[basic-linear-transform-operation] for( int y = 0; y < image.rows; y++ ) { for( int x = 0; x < image.cols; x++ ) { - for( int c = 0; c < 3; c++ ) { + for( int c = 0; c < image.channels(); c++ ) { new_image.at(y,x)[c] = - saturate_cast( alpha*( image.at(y,x)[c] ) + beta ); + saturate_cast( alpha*image.at(y,x)[c] + beta ); } } } //! [basic-linear-transform-operation] //! [basic-linear-transform-display] - /// Create Windows - namedWindow("Original Image", WINDOW_AUTOSIZE); - namedWindow("New Image", WINDOW_AUTOSIZE); - /// Show stuff imshow("Original Image", image); imshow("New Image", new_image); diff --git a/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp b/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp index a7dff6e92b..7c977a37b9 100644 --- a/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp +++ b/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp @@ -3,6 +3,8 @@ #include "opencv2/highgui.hpp" // we're NOT "using namespace std;" here, to avoid collisions between the beta variable and std::beta in c++17 +using std::cout; +using std::endl; using namespace cv; namespace @@ -19,12 +21,13 @@ void basicLinearTransform(const Mat &img, const double alpha_, const int beta_) img.convertTo(res, -1, alpha_, beta_); hconcat(img, res, img_corrected); + imshow("Brightness and contrast adjustments", img_corrected); } void gammaCorrection(const Mat &img, const double gamma_) { CV_Assert(gamma_ >= 0); - //![changing-contrast-brightness-gamma-correction] + //! [changing-contrast-brightness-gamma-correction] Mat lookUpTable(1, 256, CV_8U); uchar* p = lookUpTable.ptr(); for( int i = 0; i < 256; ++i) @@ -32,9 +35,10 @@ void gammaCorrection(const Mat &img, const double gamma_) Mat res = img.clone(); LUT(img, lookUpTable, res); - //![changing-contrast-brightness-gamma-correction] + //! 
[changing-contrast-brightness-gamma-correction] hconcat(img, res, img_gamma_corrected); + imshow("Gamma correction", img_gamma_corrected); } void on_linear_transform_alpha_trackbar(int, void *) @@ -60,36 +64,32 @@ void on_gamma_correction_trackbar(int, void *) int main( int argc, char** argv ) { - - String imageName("../data/lena.jpg"); // by default - if (argc > 1) + CommandLineParser parser( argc, argv, "{@input | ../data/lena.jpg | input image}" ); + img_original = imread( parser.get( "@input" ) ); + if( img_original.empty() ) { - imageName = argv[1]; + cout << "Could not open or find the image!\n" << endl; + cout << "Usage: " << argv[0] << " " << endl; + return -1; } - img_original = imread( imageName ); img_corrected = Mat(img_original.rows, img_original.cols*2, img_original.type()); img_gamma_corrected = Mat(img_original.rows, img_original.cols*2, img_original.type()); hconcat(img_original, img_original, img_corrected); hconcat(img_original, img_original, img_gamma_corrected); - namedWindow("Brightness and contrast adjustments", WINDOW_AUTOSIZE); - namedWindow("Gamma correction", WINDOW_AUTOSIZE); + namedWindow("Brightness and contrast adjustments"); + namedWindow("Gamma correction"); createTrackbar("Alpha gain (contrast)", "Brightness and contrast adjustments", &alpha, 500, on_linear_transform_alpha_trackbar); createTrackbar("Beta bias (brightness)", "Brightness and contrast adjustments", &beta, 200, on_linear_transform_beta_trackbar); createTrackbar("Gamma correction", "Gamma correction", &gamma_cor, 200, on_gamma_correction_trackbar); - while (true) - { - imshow("Brightness and contrast adjustments", img_corrected); - imshow("Gamma correction", img_gamma_corrected); + on_linear_transform_alpha_trackbar(0, 0); + on_gamma_correction_trackbar(0, 0); - int c = waitKey(30); - if (c == 27) - break; - } + waitKey(); imwrite("linear_transform_correction.png", img_corrected); imwrite("gamma_correction.png", img_gamma_corrected); diff --git a/samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp b/samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp new file mode 100644 index 0000000000..f40c804cf4 --- /dev/null +++ b/samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp @@ -0,0 +1,180 @@ +/* Snippet code for Operations with images tutorial (not intended to be run but should built successfully) */ + +#include "opencv2/core.hpp" +#include "opencv2/core/core_c.h" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" +#include + +using namespace cv; +using namespace std; + +int main(int,char**) +{ + std::string filename = ""; + // Input/Output + { + //! [Load an image from a file] + Mat img = imread(filename); + //! [Load an image from a file] + CV_UNUSED(img); + } + { + //! [Load an image from a file in grayscale] + Mat img = imread(filename, IMREAD_GRAYSCALE); + //! [Load an image from a file in grayscale] + CV_UNUSED(img); + } + { + Mat img(4,4,CV_8U); + //! [Save image] + imwrite(filename, img); + //! [Save image] + } + // Accessing pixel intensity values + { + Mat img(4,4,CV_8U); + int y = 0, x = 0; + { + //! [Pixel access 1] + Scalar intensity = img.at(y, x); + //! [Pixel access 1] + CV_UNUSED(intensity); + } + { + //! [Pixel access 2] + Scalar intensity = img.at(Point(x, y)); + //! [Pixel access 2] + CV_UNUSED(intensity); + } + { + //! [Pixel access 3] + Vec3b intensity = img.at(y, x); + uchar blue = intensity.val[0]; + uchar green = intensity.val[1]; + uchar red = intensity.val[2]; + //! 
[Pixel access 3] + CV_UNUSED(blue); + CV_UNUSED(green); + CV_UNUSED(red); + } + { + //! [Pixel access 4] + Vec3f intensity = img.at(y, x); + float blue = intensity.val[0]; + float green = intensity.val[1]; + float red = intensity.val[2]; + //! [Pixel access 4] + CV_UNUSED(blue); + CV_UNUSED(green); + CV_UNUSED(red); + } + { + //! [Pixel access 5] + img.at(y, x) = 128; + //! [Pixel access 5] + } + { + int i = 0; + //! [Mat from points vector] + vector points; + //... fill the array + Mat pointsMat = Mat(points); + //! [Mat from points vector] + + //! [Point access] + Point2f point = pointsMat.at(i, 0); + //! [Point access] + CV_UNUSED(point); + } + } + // Memory management and reference counting + { + //! [Reference counting 1] + std::vector points; + // .. fill the array + Mat pointsMat = Mat(points).reshape(1); + //! [Reference counting 1] + CV_UNUSED(pointsMat); + } + { + //! [Reference counting 2] + Mat img = imread("image.jpg"); + Mat img1 = img.clone(); + //! [Reference counting 2] + CV_UNUSED(img1); + } + { + //! [Reference counting 3] + Mat img = imread("image.jpg"); + Mat sobelx; + Sobel(img, sobelx, CV_32F, 1, 0); + //! [Reference counting 3] + } + // Primitive operations + { + Mat img; + { + //! [Set image to black] + img = Scalar(0); + //! [Set image to black] + } + { + //! [Select ROI] + Rect r(10, 10, 100, 100); + Mat smallImg = img(r); + //! [Select ROI] + CV_UNUSED(smallImg); + } + } + { + //! [C-API conversion] + Mat img = imread("image.jpg"); + IplImage img1 = img; + CvMat m = img; + //! [C-API conversion] + CV_UNUSED(img1); + CV_UNUSED(m); + } + { + //! [BGR to Gray] + Mat img = imread("image.jpg"); // loading a 8UC3 image + Mat grey; + cvtColor(img, grey, COLOR_BGR2GRAY); + //! [BGR to Gray] + } + { + Mat dst, src; + //! [Convert to CV_32F] + src.convertTo(dst, CV_32F); + //! [Convert to CV_32F] + } + // Visualizing images + { + //! [imshow 1] + Mat img = imread("image.jpg"); + namedWindow("image", WINDOW_AUTOSIZE); + imshow("image", img); + waitKey(); + //! [imshow 1] + } + { + //! [imshow 2] + Mat img = imread("image.jpg"); + Mat grey; + cvtColor(img, grey, COLOR_BGR2GRAY); + Mat sobelx; + Sobel(grey, sobelx, CV_32F, 1, 0); + double minVal, maxVal; + minMaxLoc(sobelx, &minVal, &maxVal); //find minimum and maximum intensities + Mat draw; + sobelx.convertTo(draw, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal)); + namedWindow("image", WINDOW_AUTOSIZE); + imshow("image", draw); + waitKey(); + //! [imshow 2] + } + + return 0; +} diff --git a/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp b/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp index 489bb88393..c194e82f24 100644 --- a/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp +++ b/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp @@ -21,13 +21,9 @@ double getOrientation(const vector &, Mat&); */ void drawAxis(Mat& img, Point p, Point q, Scalar colour, const float scale = 0.2) { -//! [visualization1] - double angle; - double hypotenuse; - angle = atan2( (double) p.y - q.y, (double) p.x - q.x ); // angle in radians - hypotenuse = sqrt( (double) (p.y - q.y) * (p.y - q.y) + (p.x - q.x) * (p.x - q.x)); -// double degrees = angle * 180 / CV_PI; // convert radians to degrees (0-180 range) -// cout << "Degrees: " << abs(degrees - 180) << endl; // angle in 0-360 degrees range + //! 
[visualization1] + double angle = atan2( (double) p.y - q.y, (double) p.x - q.x ); // angle in radians + double hypotenuse = sqrt( (double) (p.y - q.y) * (p.y - q.y) + (p.x - q.x) * (p.x - q.x)); // Here we lengthen the arrow by a factor of scale q.x = (int) (p.x - scale * hypotenuse * cos(angle)); @@ -42,7 +38,7 @@ void drawAxis(Mat& img, Point p, Point q, Scalar colour, const float scale = 0.2 p.x = (int) (q.x + 9 * cos(angle - CV_PI / 4)); p.y = (int) (q.y + 9 * sin(angle - CV_PI / 4)); line(img, p, q, colour, 1, LINE_AA); -//! [visualization1] + //! [visualization1] } /** @@ -50,11 +46,11 @@ void drawAxis(Mat& img, Point p, Point q, Scalar colour, const float scale = 0.2 */ double getOrientation(const vector &pts, Mat &img) { -//! [pca] + //! [pca] //Construct a buffer used by the pca analysis int sz = static_cast(pts.size()); - Mat data_pts = Mat(sz, 2, CV_64FC1); - for (int i = 0; i < data_pts.rows; ++i) + Mat data_pts = Mat(sz, 2, CV_64F); + for (int i = 0; i < data_pts.rows; i++) { data_pts.at(i, 0) = pts[i].x; data_pts.at(i, 1) = pts[i].y; @@ -70,16 +66,16 @@ double getOrientation(const vector &pts, Mat &img) //Store the eigenvalues and eigenvectors vector eigen_vecs(2); vector eigen_val(2); - for (int i = 0; i < 2; ++i) + for (int i = 0; i < 2; i++) { eigen_vecs[i] = Point2d(pca_analysis.eigenvectors.at(i, 0), pca_analysis.eigenvectors.at(i, 1)); eigen_val[i] = pca_analysis.eigenvalues.at(i); } + //! [pca] -//! [pca] -//! [visualization] + //! [visualization] // Draw the principal components circle(img, cntr, 3, Scalar(255, 0, 255), 2); Point p1 = cntr + 0.02 * Point(static_cast(eigen_vecs[0].x * eigen_val[0]), static_cast(eigen_vecs[0].y * eigen_val[0])); @@ -88,7 +84,7 @@ double getOrientation(const vector &pts, Mat &img) drawAxis(img, cntr, p2, Scalar(255, 255, 0), 5); double angle = atan2(eigen_vecs[0].y, eigen_vecs[0].x); // orientation in radians -//! [visualization] + //! [visualization] return angle; } @@ -98,10 +94,10 @@ double getOrientation(const vector &pts, Mat &img) */ int main(int argc, char** argv) { -//! [pre-process] + //! [pre-process] // Load image CommandLineParser parser(argc, argv, "{@input | ../data/pca_test1.jpg | input image}"); - parser.about( "This program demonstrates how to use OpenCV PCA to extract the orienation of an object.\n" ); + parser.about( "This program demonstrates how to use OpenCV PCA to extract the orientation of an object.\n" ); parser.printMessage(); Mat src = imread(parser.get("@input")); @@ -122,14 +118,14 @@ int main(int argc, char** argv) // Convert image to binary Mat bw; threshold(gray, bw, 50, 255, THRESH_BINARY | THRESH_OTSU); -//! [pre-process] + //! [pre-process] -//! [contours] + //! [contours] // Find all the contours in the thresholded image vector > contours; findContours(bw, contours, RETR_LIST, CHAIN_APPROX_NONE); - for (size_t i = 0; i < contours.size(); ++i) + for (size_t i = 0; i < contours.size(); i++) { // Calculate the area of each contour double area = contourArea(contours[i]); @@ -137,14 +133,14 @@ int main(int argc, char** argv) if (area < 1e2 || 1e5 < area) continue; // Draw each contour only for visualisation purposes - drawContours(src, contours, static_cast(i), Scalar(0, 0, 255), 2, LINE_8); + drawContours(src, contours, static_cast(i), Scalar(0, 0, 255), 2); // Find the orientation of each shape getOrientation(contours[i], src); } -//! [contours] + //! 
[contours] imshow("output", src); - waitKey(0); + waitKey(); return 0; } diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp index 9b0d569c65..a5bcf98cc0 100644 --- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp +++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp @@ -1,6 +1,6 @@ #include #include -#include "opencv2/imgcodecs.hpp" +#include #include #include @@ -9,21 +9,16 @@ using namespace cv::ml; int main(int, char**) { - // Data for visual representation - int width = 512, height = 512; - Mat image = Mat::zeros(height, width, CV_8UC3); - // Set up training data //! [setup1] int labels[4] = {1, -1, -1, -1}; float trainingData[4][2] = { {501, 10}, {255, 10}, {501, 255}, {10, 501} }; //! [setup1] //! [setup2] - Mat trainingDataMat(4, 2, CV_32FC1, trainingData); + Mat trainingDataMat(4, 2, CV_32F, trainingData); Mat labelsMat(4, 1, CV_32SC1, labels); //! [setup2] - // Train the SVM //! [init] Ptr svm = SVM::create(); @@ -35,11 +30,16 @@ int main(int, char**) svm->train(trainingDataMat, ROW_SAMPLE, labelsMat); //! [train] + // Data for visual representation + int width = 512, height = 512; + Mat image = Mat::zeros(height, width, CV_8UC3); + // Show the decision regions given by the SVM //! [show] - Vec3b green(0,255,0), blue (255,0,0); - for (int i = 0; i < image.rows; ++i) - for (int j = 0; j < image.cols; ++j) + Vec3b green(0,255,0), blue(255,0,0); + for (int i = 0; i < image.rows; i++) + { + for (int j = 0; j < image.cols; j++) { Mat sampleMat = (Mat_(1,2) << j,i); float response = svm->predict(sampleMat); @@ -49,34 +49,33 @@ int main(int, char**) else if (response == -1) image.at(i,j) = blue; } + } //! [show] // Show the training data //! [show_data] int thickness = -1; - int lineType = 8; - circle( image, Point(501, 10), 5, Scalar( 0, 0, 0), thickness, lineType ); - circle( image, Point(255, 10), 5, Scalar(255, 255, 255), thickness, lineType ); - circle( image, Point(501, 255), 5, Scalar(255, 255, 255), thickness, lineType ); - circle( image, Point( 10, 501), 5, Scalar(255, 255, 255), thickness, lineType ); + circle( image, Point(501, 10), 5, Scalar( 0, 0, 0), thickness ); + circle( image, Point(255, 10), 5, Scalar(255, 255, 255), thickness ); + circle( image, Point(501, 255), 5, Scalar(255, 255, 255), thickness ); + circle( image, Point( 10, 501), 5, Scalar(255, 255, 255), thickness ); //! [show_data] // Show support vectors //! [show_vectors] thickness = 2; - lineType = 8; Mat sv = svm->getUncompressedSupportVectors(); - for (int i = 0; i < sv.rows; ++i) + for (int i = 0; i < sv.rows; i++) { const float* v = sv.ptr(i); - circle( image, Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thickness, lineType); + circle(image, Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thickness); } //! 
[show_vectors] imwrite("result.png", image); // save the image imshow("SVM Simple Example", image); // show it to the user - waitKey(0); - + waitKey(); + return 0; } diff --git a/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp b/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp index d046b50d0d..f8b7a373cc 100644 --- a/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp +++ b/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp @@ -5,9 +5,6 @@ #include #include -#define NTRAINING_SAMPLES 100 // Number of training samples per class -#define FRAC_LINEAR_SEP 0.9f // Fraction of samples which compose the linear separable part - using namespace cv; using namespace cv::ml; using namespace std; @@ -16,8 +13,6 @@ static void help() { cout<< "\n--------------------------------------------------------------------------" << endl << "This program shows Support Vector Machines for Non-Linearly Separable Data. " << endl - << "Usage:" << endl - << "./non_linear_svms" << endl << "--------------------------------------------------------------------------" << endl << endl; } @@ -26,13 +21,16 @@ int main() { help(); + const int NTRAINING_SAMPLES = 100; // Number of training samples per class + const float FRAC_LINEAR_SEP = 0.9f; // Fraction of samples which compose the linear separable part + // Data for visual representation const int WIDTH = 512, HEIGHT = 512; Mat I = Mat::zeros(HEIGHT, WIDTH, CV_8UC3); //--------------------- 1. Set up training data randomly --------------------------------------- - Mat trainData(2*NTRAINING_SAMPLES, 2, CV_32FC1); - Mat labels (2*NTRAINING_SAMPLES, 1, CV_32SC1); + Mat trainData(2*NTRAINING_SAMPLES, 2, CV_32F); + Mat labels (2*NTRAINING_SAMPLES, 1, CV_32S); RNG rng(100); // Random value generation class @@ -44,10 +42,10 @@ int main() Mat trainClass = trainData.rowRange(0, nLinearSamples); // The x coordinate of the points is in [0, 0.4) Mat c = trainClass.colRange(0, 1); - rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(0.4 * WIDTH)); + rng.fill(c, RNG::UNIFORM, Scalar(0), Scalar(0.4 * WIDTH)); // The y coordinate of the points is in [0, 1) c = trainClass.colRange(1,2); - rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT)); + rng.fill(c, RNG::UNIFORM, Scalar(0), Scalar(HEIGHT)); // Generate random points for the class 2 trainClass = trainData.rowRange(2*NTRAINING_SAMPLES-nLinearSamples, 2*NTRAINING_SAMPLES); @@ -56,26 +54,26 @@ int main() rng.fill(c, RNG::UNIFORM, Scalar(0.6*WIDTH), Scalar(WIDTH)); // The y coordinate of the points is in [0, 1) c = trainClass.colRange(1,2); - rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT)); + rng.fill(c, RNG::UNIFORM, Scalar(0), Scalar(HEIGHT)); //! [setup1] //------------------ Set up the non-linearly separable part of the training data --------------- //! [setup2] // Generate random points for the classes 1 and 2 - trainClass = trainData.rowRange( nLinearSamples, 2*NTRAINING_SAMPLES-nLinearSamples); + trainClass = trainData.rowRange(nLinearSamples, 2*NTRAINING_SAMPLES-nLinearSamples); // The x coordinate of the points is in [0.4, 0.6) c = trainClass.colRange(0,1); rng.fill(c, RNG::UNIFORM, Scalar(0.4*WIDTH), Scalar(0.6*WIDTH)); // The y coordinate of the points is in [0, 1) c = trainClass.colRange(1,2); - rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT)); + rng.fill(c, RNG::UNIFORM, Scalar(0), Scalar(HEIGHT)); //! 
[setup2] + //------------------------- Set up the labels for the classes --------------------------------- labels.rowRange( 0, NTRAINING_SAMPLES).setTo(1); // Class 1 labels.rowRange(NTRAINING_SAMPLES, 2*NTRAINING_SAMPLES).setTo(2); // Class 2 //------------------------ 2. Set up the support vector machines parameters -------------------- - //------------------------ 3. Train the svm ---------------------------------------------------- cout << "Starting training process" << endl; //! [init] Ptr svm = SVM::create(); @@ -84,6 +82,8 @@ int main() svm->setKernel(SVM::LINEAR); svm->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, (int)1e7, 1e-6)); //! [init] + + //------------------------ 3. Train the svm ---------------------------------------------------- //! [train] svm->train(trainData, ROW_SAMPLE, labels); //! [train] @@ -91,53 +91,54 @@ int main() //------------------------ 4. Show the decision regions ---------------------------------------- //! [show] - Vec3b green(0,100,0), blue (100,0,0); - for (int i = 0; i < I.rows; ++i) - for (int j = 0; j < I.cols; ++j) + Vec3b green(0,100,0), blue(100,0,0); + for (int i = 0; i < I.rows; i++) + { + for (int j = 0; j < I.cols; j++) { - Mat sampleMat = (Mat_(1,2) << i, j); + Mat sampleMat = (Mat_(1,2) << j, i); float response = svm->predict(sampleMat); - if (response == 1) I.at(j, i) = green; - else if (response == 2) I.at(j, i) = blue; + if (response == 1) I.at(i,j) = green; + else if (response == 2) I.at(i,j) = blue; } + } //! [show] //----------------------- 5. Show the training data -------------------------------------------- //! [show_data] int thick = -1; - int lineType = 8; float px, py; // Class 1 - for (int i = 0; i < NTRAINING_SAMPLES; ++i) + for (int i = 0; i < NTRAINING_SAMPLES; i++) { px = trainData.at(i,0); py = trainData.at(i,1); - circle(I, Point( (int) px, (int) py ), 3, Scalar(0, 255, 0), thick, lineType); + circle(I, Point( (int) px, (int) py ), 3, Scalar(0, 255, 0), thick); } // Class 2 - for (int i = NTRAINING_SAMPLES; i <2*NTRAINING_SAMPLES; ++i) + for (int i = NTRAINING_SAMPLES; i <2*NTRAINING_SAMPLES; i++) { px = trainData.at(i,0); py = trainData.at(i,1); - circle(I, Point( (int) px, (int) py ), 3, Scalar(255, 0, 0), thick, lineType); + circle(I, Point( (int) px, (int) py ), 3, Scalar(255, 0, 0), thick); } //! [show_data] //------------------------- 6. Show support vectors -------------------------------------------- //! [show_vectors] thick = 2; - lineType = 8; Mat sv = svm->getUncompressedSupportVectors(); - for (int i = 0; i < sv.rows; ++i) + for (int i = 0; i < sv.rows; i++) { const float* v = sv.ptr(i); - circle( I, Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thick, lineType); + circle(I, Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thick); } //! [show_vectors] - imwrite("result.png", I); // save the Image + imwrite("result.png", I); // save the Image imshow("SVM for Non-Linear Training Data", I); // show it to the user - waitKey(0); + waitKey(); + return 0; } diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp index 922bdcc9a0..161f7434f8 100644 --- a/samples/dnn/object_detection.cpp +++ b/samples/dnn/object_detection.cpp @@ -22,7 +22,7 @@ const char* keys = "{ height | -1 | Preprocess input image by resizing to a specific height. }" "{ rgb | | Indicate that model works with RGB input images instead BGR ones. }" "{ thr | .5 | Confidence threshold. }" - "{ thr | .4 | Non-maximum suppression threshold. }" + "{ nms | .4 | Non-maximum suppression threshold. 
}" "{ backend | 0 | Choose one of computation backends: " "0: automatically (by default), " "1: Halide language (http://halide-lang.org/), " diff --git a/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java b/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java new file mode 100644 index 0000000000..e55e3d9699 --- /dev/null +++ b/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java @@ -0,0 +1,86 @@ +import java.util.Scanner; + +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; + +class BasicLinearTransforms { + private byte saturate(double val) { + int iVal = (int) Math.round(val); + iVal = iVal > 255 ? 255 : (iVal < 0 ? 0 : iVal); + return (byte) iVal; + } + + public void run(String[] args) { + /// Read image given by user + //! [basic-linear-transform-load] + String imagePath = args.length > 0 ? args[0] : "../data/lena.jpg"; + Mat image = Imgcodecs.imread(imagePath); + if (image.empty()) { + System.out.println("Empty image: " + imagePath); + System.exit(0); + } + //! [basic-linear-transform-load] + + //! [basic-linear-transform-output] + Mat newImage = Mat.zeros(image.size(), image.type()); + //! [basic-linear-transform-output] + + //! [basic-linear-transform-parameters] + double alpha = 1.0; /*< Simple contrast control */ + int beta = 0; /*< Simple brightness control */ + + /// Initialize values + System.out.println(" Basic Linear Transforms "); + System.out.println("-------------------------"); + try (Scanner scanner = new Scanner(System.in)) { + System.out.print("* Enter the alpha value [1.0-3.0]: "); + alpha = scanner.nextDouble(); + System.out.print("* Enter the beta value [0-100]: "); + beta = scanner.nextInt(); + } + //! [basic-linear-transform-parameters] + + /// Do the operation newImage(i,j) = alpha*image(i,j) + beta + /// Instead of these 'for' loops we could have used simply: + /// image.convertTo(newImage, -1, alpha, beta); + /// but we wanted to show you how to access the pixels :) + //! [basic-linear-transform-operation] + byte[] imageData = new byte[(int) (image.total()*image.channels())]; + image.get(0, 0, imageData); + byte[] newImageData = new byte[(int) (newImage.total()*newImage.channels())]; + for (int y = 0; y < image.rows(); y++) { + for (int x = 0; x < image.cols(); x++) { + for (int c = 0; c < image.channels(); c++) { + double pixelValue = imageData[(y * image.cols() + x) * image.channels() + c]; + /// Java byte range is [-128, 127] + pixelValue = pixelValue < 0 ? pixelValue + 256 : pixelValue; + newImageData[(y * image.cols() + x) * image.channels() + c] + = saturate(alpha * pixelValue + beta); + } + } + } + newImage.put(0, 0, newImageData); + //! [basic-linear-transform-operation] + + //! [basic-linear-transform-display] + /// Show stuff + HighGui.imshow("Original Image", image); + HighGui.imshow("New Image", newImage); + + /// Wait until user press some key + HighGui.waitKey(); + //! 
[basic-linear-transform-display] + System.exit(0); + } +} + +public class BasicLinearTransformsDemo { + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + new BasicLinearTransforms().run(args); + } +} diff --git a/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java b/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java new file mode 100644 index 0000000000..62bdf38b7e --- /dev/null +++ b/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java @@ -0,0 +1,202 @@ +import java.awt.BorderLayout; +import java.awt.Container; +import java.awt.Image; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; + +import javax.swing.BoxLayout; +import javax.swing.ImageIcon; +import javax.swing.JCheckBox; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JSlider; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +import org.opencv.core.Core; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; + +class ChangingContrastBrightnessImage { + private static int MAX_VALUE_ALPHA = 500; + private static int MAX_VALUE_BETA_GAMMA = 200; + private static final String WINDOW_NAME = "Changing the contrast and brightness of an image demo"; + private static final String ALPHA_NAME = "Alpha gain (contrast)"; + private static final String BETA_NAME = "Beta bias (brightness)"; + private static final String GAMMA_NAME = "Gamma correction"; + private JFrame frame; + private Mat matImgSrc = new Mat(); + private JLabel imgSrcLabel; + private JLabel imgModifLabel; + private JPanel controlPanel; + private JPanel alphaBetaPanel; + private JPanel gammaPanel; + private double alphaValue = 1.0; + private double betaValue = 0.0; + private double gammaValue = 1.0; + private JCheckBox methodCheckBox; + private JSlider sliderAlpha; + private JSlider sliderBeta; + private JSlider sliderGamma; + + public ChangingContrastBrightnessImage(String[] args) { + String imagePath = args.length > 0 ? args[0] : "../data/lena.jpg"; + matImgSrc = Imgcodecs.imread(imagePath); + if (matImgSrc.empty()) { + System.out.println("Empty image: " + imagePath); + System.exit(0); + } + + // Create and set up the window. + frame = new JFrame(WINDOW_NAME); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + // Set up the content pane. + Image img = HighGui.toBufferedImage(matImgSrc); + addComponentsToPane(frame.getContentPane(), img); + // Use the content pane's default BorderLayout. No need for + // setLayout(new BorderLayout()); + // Display the window. 
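Note (illustrative, not part of the patch): the per-pixel loop in BasicLinearTransformsDemo above exists only to demonstrate element access; in C++ the same alpha/beta transform with clamping is a single convertTo call, because convertTo applies saturate_cast internally. A minimal sketch, with a hypothetical input path and example gain/bias values:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat image = cv::imread("../data/lena.jpg");   // hypothetical input
    if (image.empty()) return -1;
    double alpha = 1.3;                               // contrast gain
    int beta = 40;                                    // brightness bias
    cv::Mat new_image;
    image.convertTo(new_image, -1, alpha, beta);      // saturate(alpha*image + beta), same depth
    cv::imwrite("linear_transform.png", new_image);
    return 0;
}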
+ frame.pack(); + frame.setVisible(true); + } + + private void addComponentsToPane(Container pane, Image img) { + if (!(pane.getLayout() instanceof BorderLayout)) { + pane.add(new JLabel("Container doesn't use BorderLayout!")); + return; + } + + controlPanel = new JPanel(); + controlPanel.setLayout(new BoxLayout(controlPanel, BoxLayout.PAGE_AXIS)); + + methodCheckBox = new JCheckBox("Do gamma correction"); + methodCheckBox.addActionListener(new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + JCheckBox cb = (JCheckBox) e.getSource(); + if (cb.isSelected()) { + controlPanel.remove(alphaBetaPanel); + controlPanel.add(gammaPanel); + performGammaCorrection(); + frame.revalidate(); + frame.repaint(); + frame.pack(); + } else { + controlPanel.remove(gammaPanel); + controlPanel.add(alphaBetaPanel); + performLinearTransformation(); + frame.revalidate(); + frame.repaint(); + frame.pack(); + } + } + }); + controlPanel.add(methodCheckBox); + + alphaBetaPanel = new JPanel(); + alphaBetaPanel.setLayout(new BoxLayout(alphaBetaPanel, BoxLayout.PAGE_AXIS)); + alphaBetaPanel.add(new JLabel(ALPHA_NAME)); + sliderAlpha = new JSlider(0, MAX_VALUE_ALPHA, 100); + sliderAlpha.setMajorTickSpacing(50); + sliderAlpha.setMinorTickSpacing(10); + sliderAlpha.setPaintTicks(true); + sliderAlpha.setPaintLabels(true); + sliderAlpha.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + alphaValue = sliderAlpha.getValue() / 100.0; + performLinearTransformation(); + } + }); + alphaBetaPanel.add(sliderAlpha); + + alphaBetaPanel.add(new JLabel(BETA_NAME)); + sliderBeta = new JSlider(0, MAX_VALUE_BETA_GAMMA, 100); + sliderBeta.setMajorTickSpacing(20); + sliderBeta.setMinorTickSpacing(5); + sliderBeta.setPaintTicks(true); + sliderBeta.setPaintLabels(true); + sliderBeta.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + betaValue = sliderBeta.getValue() - 100; + performLinearTransformation(); + } + }); + alphaBetaPanel.add(sliderBeta); + controlPanel.add(alphaBetaPanel); + + gammaPanel = new JPanel(); + gammaPanel.setLayout(new BoxLayout(gammaPanel, BoxLayout.PAGE_AXIS)); + gammaPanel.add(new JLabel(GAMMA_NAME)); + sliderGamma = new JSlider(0, MAX_VALUE_BETA_GAMMA, 100); + sliderGamma.setMajorTickSpacing(20); + sliderGamma.setMinorTickSpacing(5); + sliderGamma.setPaintTicks(true); + sliderGamma.setPaintLabels(true); + sliderGamma.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + gammaValue = sliderGamma.getValue() / 100.0; + performGammaCorrection(); + } + }); + gammaPanel.add(sliderGamma); + + pane.add(controlPanel, BorderLayout.PAGE_START); + JPanel framePanel = new JPanel(); + imgSrcLabel = new JLabel(new ImageIcon(img)); + framePanel.add(imgSrcLabel); + imgModifLabel = new JLabel(new ImageIcon(img)); + framePanel.add(imgModifLabel); + pane.add(framePanel, BorderLayout.CENTER); + } + + private void performLinearTransformation() { + Mat img = new Mat(); + matImgSrc.convertTo(img, -1, alphaValue, betaValue); + imgModifLabel.setIcon(new ImageIcon(HighGui.toBufferedImage(img))); + frame.repaint(); + } + + private byte saturate(double val) { + int iVal = (int) Math.round(val); + iVal = iVal > 255 ? 255 : (iVal < 0 ? 0 : iVal); + return (byte) iVal; + } + + private void performGammaCorrection() { + //! 
[changing-contrast-brightness-gamma-correction] + Mat lookUpTable = new Mat(1, 256, CvType.CV_8U); + byte[] lookUpTableData = new byte[(int) (lookUpTable.total()*lookUpTable.channels())]; + for (int i = 0; i < lookUpTable.cols(); i++) { + lookUpTableData[i] = saturate(Math.pow(i / 255.0, gammaValue) * 255.0); + } + lookUpTable.put(0, 0, lookUpTableData); + Mat img = new Mat(); + Core.LUT(matImgSrc, lookUpTable, img); + //! [changing-contrast-brightness-gamma-correction] + + imgModifLabel.setIcon(new ImageIcon(HighGui.toBufferedImage(img))); + frame.repaint(); + } +} + +public class ChangingContrastBrightnessImageDemo { + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + // Schedule a job for the event dispatch thread: + // creating and showing this application's GUI. + javax.swing.SwingUtilities.invokeLater(new Runnable() { + @Override + public void run() { + new ChangingContrastBrightnessImage(args); + } + }); + } +} diff --git a/samples/java/tutorial_code/core/mat_operations/MatOperations.java b/samples/java/tutorial_code/core/mat_operations/MatOperations.java new file mode 100644 index 0000000000..e5fe7cdd78 --- /dev/null +++ b/samples/java/tutorial_code/core/mat_operations/MatOperations.java @@ -0,0 +1,130 @@ +import java.util.Arrays; + +import org.opencv.core.Core; +import org.opencv.core.Core.MinMaxLocResult; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.core.Rect; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; + +public class MatOperations { + @SuppressWarnings("unused") + public static void main(String[] args) { + /* Snippet code for Operations with images tutorial (not intended to be run) */ + + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + String filename = ""; + // Input/Output + { + //! [Load an image from a file] + Mat img = Imgcodecs.imread(filename); + //! [Load an image from a file] + } + { + //! [Load an image from a file in grayscale] + Mat img = Imgcodecs.imread(filename, Imgcodecs.IMREAD_GRAYSCALE); + //! [Load an image from a file in grayscale] + } + { + Mat img = new Mat(4, 4, CvType.CV_8U); + //! [Save image] + Imgcodecs.imwrite(filename, img); + //! [Save image] + } + // Accessing pixel intensity values + { + Mat img = new Mat(4, 4, CvType.CV_8U); + int y = 0, x = 0; + { + //! [Pixel access 1] + byte[] imgData = new byte[(int) (img.total() * img.channels())]; + img.get(0, 0, imgData); + byte intensity = imgData[y * img.cols() + x]; + //! [Pixel access 1] + } + { + //! [Pixel access 5] + byte[] imgData = new byte[(int) (img.total() * img.channels())]; + imgData[y * img.cols() + x] = (byte) 128; + img.put(0, 0, imgData); + //! [Pixel access 5] + } + + } + // Memory management and reference counting + { + //! [Reference counting 2] + Mat img = Imgcodecs.imread("image.jpg"); + Mat img1 = img.clone(); + //! [Reference counting 2] + } + { + //! [Reference counting 3] + Mat img = Imgcodecs.imread("image.jpg"); + Mat sobelx = new Mat(); + Imgproc.Sobel(img, sobelx, CvType.CV_32F, 1, 0); + //! [Reference counting 3] + } + // Primitive operations + { + Mat img = new Mat(400, 400, CvType.CV_8UC3); + { + //! [Set image to black] + byte[] imgData = new byte[(int) (img.total() * img.channels())]; + Arrays.fill(imgData, (byte) 0); + img.put(0, 0, imgData); + //! [Set image to black] + } + { + //! 
[Select ROI] + Rect r = new Rect(10, 10, 100, 100); + Mat smallImg = img.submat(r); + //! [Select ROI] + } + } + { + //! [BGR to Gray] + Mat img = Imgcodecs.imread("image.jpg"); // loading a 8UC3 image + Mat grey = new Mat(); + Imgproc.cvtColor(img, grey, Imgproc.COLOR_BGR2GRAY); + //! [BGR to Gray] + } + { + Mat dst = new Mat(), src = new Mat(); + //! [Convert to CV_32F] + src.convertTo(dst, CvType.CV_32F); + //! [Convert to CV_32F] + } + // Visualizing images + { + //! [imshow 1] + Mat img = Imgcodecs.imread("image.jpg"); + HighGui.namedWindow("image", HighGui.WINDOW_AUTOSIZE); + HighGui.imshow("image", img); + HighGui.waitKey(); + //! [imshow 1] + } + { + //! [imshow 2] + Mat img = Imgcodecs.imread("image.jpg"); + Mat grey = new Mat(); + Imgproc.cvtColor(img, grey, Imgproc.COLOR_BGR2GRAY); + Mat sobelx = new Mat(); + Imgproc.Sobel(grey, sobelx, CvType.CV_32F, 1, 0); + MinMaxLocResult res = Core.minMaxLoc(sobelx); // find minimum and maximum intensities + Mat draw = new Mat(); + double maxVal = res.maxVal, minVal = res.minVal; + sobelx.convertTo(draw, CvType.CV_8U, 255.0 / (maxVal - minVal), -minVal * 255.0 / (maxVal - minVal)); + HighGui.namedWindow("image", HighGui.WINDOW_AUTOSIZE); + HighGui.imshow("image", draw); + HighGui.waitKey(); + //! [imshow 2] + } + System.exit(0); + } + +} diff --git a/samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java b/samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java new file mode 100644 index 0000000000..5dfef53efa --- /dev/null +++ b/samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java @@ -0,0 +1,144 @@ +import java.util.ArrayList; +import java.util.List; + +import org.opencv.core.Core; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.core.MatOfPoint; +import org.opencv.core.Point; +import org.opencv.core.Scalar; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; + +//This program demonstrates how to use OpenCV PCA to extract the orientation of an object. +class IntroductionToPCA { + private void drawAxis(Mat img, Point p_, Point q_, Scalar colour, float scale) { + Point p = new Point(p_.x, p_.y); + Point q = new Point(q_.x, q_.y); + //! [visualization1] + double angle = Math.atan2(p.y - q.y, p.x - q.x); // angle in radians + double hypotenuse = Math.sqrt((p.y - q.y) * (p.y - q.y) + (p.x - q.x) * (p.x - q.x)); + + // Here we lengthen the arrow by a factor of scale + q.x = (int) (p.x - scale * hypotenuse * Math.cos(angle)); + q.y = (int) (p.y - scale * hypotenuse * Math.sin(angle)); + Imgproc.line(img, p, q, colour, 1, Core.LINE_AA, 0); + + // create the arrow hooks + p.x = (int) (q.x + 9 * Math.cos(angle + Math.PI / 4)); + p.y = (int) (q.y + 9 * Math.sin(angle + Math.PI / 4)); + Imgproc.line(img, p, q, colour, 1, Core.LINE_AA, 0); + + p.x = (int) (q.x + 9 * Math.cos(angle - Math.PI / 4)); + p.y = (int) (q.y + 9 * Math.sin(angle - Math.PI / 4)); + Imgproc.line(img, p, q, colour, 1, Core.LINE_AA, 0); + //! [visualization1] + } + + private double getOrientation(MatOfPoint ptsMat, Mat img) { + List pts = ptsMat.toList(); + //! 
[pca] + // Construct a buffer used by the pca analysis + int sz = pts.size(); + Mat dataPts = new Mat(sz, 2, CvType.CV_64F); + double[] dataPtsData = new double[(int) (dataPts.total() * dataPts.channels())]; + for (int i = 0; i < dataPts.rows(); i++) { + dataPtsData[i * dataPts.cols()] = pts.get(i).x; + dataPtsData[i * dataPts.cols() + 1] = pts.get(i).y; + } + dataPts.put(0, 0, dataPtsData); + + // Perform PCA analysis + Mat mean = new Mat(); + Mat eigenvectors = new Mat(); + Mat eigenvalues = new Mat(); + Core.PCACompute2(dataPts, mean, eigenvectors, eigenvalues); + double[] meanData = new double[(int) (mean.total() * mean.channels())]; + mean.get(0, 0, meanData); + + // Store the center of the object + Point cntr = new Point(meanData[0], meanData[1]); + + // Store the eigenvalues and eigenvectors + double[] eigenvectorsData = new double[(int) (eigenvectors.total() * eigenvectors.channels())]; + double[] eigenvaluesData = new double[(int) (eigenvalues.total() * eigenvalues.channels())]; + eigenvectors.get(0, 0, eigenvectorsData); + eigenvalues.get(0, 0, eigenvaluesData); + //! [pca] + + //! [visualization] + // Draw the principal components + Imgproc.circle(img, cntr, 3, new Scalar(255, 0, 255), 2); + Point p1 = new Point(cntr.x + 0.02 * eigenvectorsData[0] * eigenvaluesData[0], + cntr.y + 0.02 * eigenvectorsData[1] * eigenvaluesData[0]); + Point p2 = new Point(cntr.x - 0.02 * eigenvectorsData[2] * eigenvaluesData[1], + cntr.y - 0.02 * eigenvectorsData[3] * eigenvaluesData[1]); + drawAxis(img, cntr, p1, new Scalar(0, 255, 0), 1); + drawAxis(img, cntr, p2, new Scalar(255, 255, 0), 5); + + double angle = Math.atan2(eigenvectorsData[1], eigenvectorsData[0]); // orientation in radians + //! [visualization] + + return angle; + } + + public void run(String[] args) { + //! [pre-process] + // Load image + String filename = args.length > 0 ? args[0] : "../data/pca_test1.jpg"; + Mat src = Imgcodecs.imread(filename); + + // Check if image is loaded successfully + if (src.empty()) { + System.err.println("Cannot read image: " + filename); + System.exit(0); + } + + Mat srcOriginal = src.clone(); + HighGui.imshow("src", srcOriginal); + + // Convert image to grayscale + Mat gray = new Mat(); + Imgproc.cvtColor(src, gray, Imgproc.COLOR_BGR2GRAY); + + // Convert image to binary + Mat bw = new Mat(); + Imgproc.threshold(gray, bw, 50, 255, Imgproc.THRESH_BINARY | Imgproc.THRESH_OTSU); + //! [pre-process] + + //! [contours] + // Find all the contours in the thresholded image + List contours = new ArrayList<>(); + Mat hierarchy = new Mat(); + Imgproc.findContours(bw, contours, hierarchy, Imgproc.RETR_LIST, Imgproc.CHAIN_APPROX_NONE); + + for (int i = 0; i < contours.size(); i++) { + // Calculate the area of each contour + double area = Imgproc.contourArea(contours.get(i)); + // Ignore contours that are too small or too large + if (area < 1e2 || 1e5 < area) + continue; + + // Draw each contour only for visualisation purposes + Imgproc.drawContours(src, contours, i, new Scalar(0, 0, 255), 2); + // Find the orientation of each shape + getOrientation(contours.get(i), src); + } + //! 
[contours] + + HighGui.imshow("output", src); + + HighGui.waitKey(); + System.exit(0); + } +} + +public class IntroductionToPCADemo { + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + new IntroductionToPCA().run(args); + } +} diff --git a/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java b/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java new file mode 100644 index 0000000000..c44483f2cf --- /dev/null +++ b/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java @@ -0,0 +1,99 @@ +import org.opencv.core.Core; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.core.Point; +import org.opencv.core.Scalar; +import org.opencv.core.TermCriteria; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; +import org.opencv.ml.Ml; +import org.opencv.ml.SVM; + +public class IntroductionToSVMDemo { + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + // Set up training data + //! [setup1] + int[] labels = { 1, -1, -1, -1 }; + float[] trainingData = { 501, 10, 255, 10, 501, 255, 10, 501 }; + //! [setup1] + //! [setup2] + Mat trainingDataMat = new Mat(4, 2, CvType.CV_32FC1); + trainingDataMat.put(0, 0, trainingData); + Mat labelsMat = new Mat(4, 1, CvType.CV_32SC1); + labelsMat.put(0, 0, labels); + //! [setup2] + + // Train the SVM + //! [init] + SVM svm = SVM.create(); + svm.setType(SVM.C_SVC); + svm.setKernel(SVM.LINEAR); + svm.setTermCriteria(new TermCriteria(TermCriteria.MAX_ITER, 100, 1e-6)); + //! [init] + //! [train] + svm.train(trainingDataMat, Ml.ROW_SAMPLE, labelsMat); + //! [train] + + // Data for visual representation + int width = 512, height = 512; + Mat image = Mat.zeros(height, width, CvType.CV_8UC3); + + // Show the decision regions given by the SVM + //! [show] + byte[] imageData = new byte[(int) (image.total() * image.channels())]; + Mat sampleMat = new Mat(1, 2, CvType.CV_32F); + float[] sampleMatData = new float[(int) (sampleMat.total() * sampleMat.channels())]; + for (int i = 0; i < image.rows(); i++) { + for (int j = 0; j < image.cols(); j++) { + sampleMatData[0] = j; + sampleMatData[1] = i; + sampleMat.put(0, 0, sampleMatData); + float response = svm.predict(sampleMat); + + if (response == 1) { + imageData[(i * image.cols() + j) * image.channels()] = 0; + imageData[(i * image.cols() + j) * image.channels() + 1] = (byte) 255; + imageData[(i * image.cols() + j) * image.channels() + 2] = 0; + } else if (response == -1) { + imageData[(i * image.cols() + j) * image.channels()] = (byte) 255; + imageData[(i * image.cols() + j) * image.channels() + 1] = 0; + imageData[(i * image.cols() + j) * image.channels() + 2] = 0; + } + } + } + image.put(0, 0, imageData); + //! [show] + + // Show the training data + //! [show_data] + int thickness = -1; + int lineType = Core.LINE_8; + Imgproc.circle(image, new Point(501, 10), 5, new Scalar(0, 0, 0), thickness, lineType, 0); + Imgproc.circle(image, new Point(255, 10), 5, new Scalar(255, 255, 255), thickness, lineType, 0); + Imgproc.circle(image, new Point(501, 255), 5, new Scalar(255, 255, 255), thickness, lineType, 0); + Imgproc.circle(image, new Point(10, 501), 5, new Scalar(255, 255, 255), thickness, lineType, 0); + //! [show_data] + + // Show support vectors + //! 
[show_vectors] + thickness = 2; + Mat sv = svm.getUncompressedSupportVectors(); + float[] svData = new float[(int) (sv.total() * sv.channels())]; + sv.get(0, 0, svData); + for (int i = 0; i < sv.rows(); ++i) { + Imgproc.circle(image, new Point(svData[i * sv.cols()], svData[i * sv.cols() + 1]), 6, + new Scalar(128, 128, 128), thickness, lineType, 0); + } + //! [show_vectors] + + Imgcodecs.imwrite("result.png", image); // save the image + + HighGui.imshow("SVM Simple Example", image); // show it to the user + HighGui.waitKey(); + System.exit(0); + } +} diff --git a/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java b/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java new file mode 100644 index 0000000000..798c1fc3ef --- /dev/null +++ b/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java @@ -0,0 +1,186 @@ +import java.util.Random; + +import org.opencv.core.Core; +import org.opencv.core.CvType; +import org.opencv.core.Mat; +import org.opencv.core.Point; +import org.opencv.core.Scalar; +import org.opencv.core.TermCriteria; +import org.opencv.highgui.HighGui; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; +import org.opencv.ml.Ml; +import org.opencv.ml.SVM; + +public class NonLinearSVMsDemo { + public static final int NTRAINING_SAMPLES = 100; + public static final float FRAC_LINEAR_SEP = 0.9f; + + public static void main(String[] args) { + // Load the native OpenCV library + System.loadLibrary(Core.NATIVE_LIBRARY_NAME); + + System.out.println("\n--------------------------------------------------------------------------"); + System.out.println("This program shows Support Vector Machines for Non-Linearly Separable Data. "); + System.out.println("--------------------------------------------------------------------------\n"); + + // Data for visual representation + int width = 512, height = 512; + Mat I = Mat.zeros(height, width, CvType.CV_8UC3); + + // --------------------- 1. Set up training data randomly--------------------------------------- + Mat trainData = new Mat(2 * NTRAINING_SAMPLES, 2, CvType.CV_32F); + Mat labels = new Mat(2 * NTRAINING_SAMPLES, 1, CvType.CV_32S); + + Random rng = new Random(100); // Random value generation class + + // Set up the linearly separable part of the training data + int nLinearSamples = (int) (FRAC_LINEAR_SEP * NTRAINING_SAMPLES); + + //! 
[setup1] + // Generate random points for the class 1 + Mat trainClass = trainData.rowRange(0, nLinearSamples); + // The x coordinate of the points is in [0, 0.4) + Mat c = trainClass.colRange(0, 1); + float[] cData = new float[(int) (c.total() * c.channels())]; + double[] cDataDbl = rng.doubles(cData.length, 0, 0.4f * width).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + // The y coordinate of the points is in [0, 1) + c = trainClass.colRange(1, 2); + cData = new float[(int) (c.total() * c.channels())]; + cDataDbl = rng.doubles(cData.length, 0, height).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + + // Generate random points for the class 2 + trainClass = trainData.rowRange(2 * NTRAINING_SAMPLES - nLinearSamples, 2 * NTRAINING_SAMPLES); + // The x coordinate of the points is in [0.6, 1] + c = trainClass.colRange(0, 1); + cData = new float[(int) (c.total() * c.channels())]; + cDataDbl = rng.doubles(cData.length, 0.6 * width, width).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + // The y coordinate of the points is in [0, 1) + c = trainClass.colRange(1, 2); + cData = new float[(int) (c.total() * c.channels())]; + cDataDbl = rng.doubles(cData.length, 0, height).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + //! [setup1] + + // ------------------ Set up the non-linearly separable part of the training data --------------- + //! [setup2] + // Generate random points for the classes 1 and 2 + trainClass = trainData.rowRange(nLinearSamples, 2 * NTRAINING_SAMPLES - nLinearSamples); + // The x coordinate of the points is in [0.4, 0.6) + c = trainClass.colRange(0, 1); + cData = new float[(int) (c.total() * c.channels())]; + cDataDbl = rng.doubles(cData.length, 0.4 * width, 0.6 * width).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + // The y coordinate of the points is in [0, 1) + c = trainClass.colRange(1, 2); + cData = new float[(int) (c.total() * c.channels())]; + cDataDbl = rng.doubles(cData.length, 0, height).toArray(); + for (int i = 0; i < cData.length; i++) { + cData[i] = (float) cDataDbl[i]; + } + c.put(0, 0, cData); + //! [setup2] + + // ------------------------- Set up the labels for the classes--------------------------------- + labels.rowRange(0, NTRAINING_SAMPLES).setTo(new Scalar(1)); // Class 1 + labels.rowRange(NTRAINING_SAMPLES, 2 * NTRAINING_SAMPLES).setTo(new Scalar(2)); // Class 2 + + // ------------------------ 2. Set up the support vector machines parameters-------------------- + System.out.println("Starting training process"); + //! [init] + SVM svm = SVM.create(); + svm.setType(SVM.C_SVC); + svm.setC(0.1); + svm.setKernel(SVM.LINEAR); + svm.setTermCriteria(new TermCriteria(TermCriteria.MAX_ITER, (int) 1e7, 1e-6)); + //! [init] + + // ------------------------ 3. Train the svm---------------------------------------------------- + //! [train] + svm.train(trainData, Ml.ROW_SAMPLE, labels); + //! [train] + System.out.println("Finished training process"); + + // ------------------------ 4. Show the decision regions---------------------------------------- + //! 
[show] + byte[] IData = new byte[(int) (I.total() * I.channels())]; + Mat sampleMat = new Mat(1, 2, CvType.CV_32F); + float[] sampleMatData = new float[(int) (sampleMat.total() * sampleMat.channels())]; + for (int i = 0; i < I.rows(); i++) { + for (int j = 0; j < I.cols(); j++) { + sampleMatData[0] = j; + sampleMatData[1] = i; + sampleMat.put(0, 0, sampleMatData); + float response = svm.predict(sampleMat); + + if (response == 1) { + IData[(i * I.cols() + j) * I.channels()] = 0; + IData[(i * I.cols() + j) * I.channels() + 1] = 100; + IData[(i * I.cols() + j) * I.channels() + 2] = 0; + } else if (response == 2) { + IData[(i * I.cols() + j) * I.channels()] = 100; + IData[(i * I.cols() + j) * I.channels() + 1] = 0; + IData[(i * I.cols() + j) * I.channels() + 2] = 0; + } + } + } + I.put(0, 0, IData); + //! [show] + + // ----------------------- 5. Show the training data-------------------------------------------- + //! [show_data] + int thick = -1; + int lineType = Core.LINE_8; + float px, py; + // Class 1 + float[] trainDataData = new float[(int) (trainData.total() * trainData.channels())]; + trainData.get(0, 0, trainDataData); + for (int i = 0; i < NTRAINING_SAMPLES; i++) { + px = trainDataData[i * trainData.cols()]; + py = trainDataData[i * trainData.cols() + 1]; + Imgproc.circle(I, new Point(px, py), 3, new Scalar(0, 255, 0), thick, lineType, 0); + } + // Class 2 + for (int i = NTRAINING_SAMPLES; i < 2 * NTRAINING_SAMPLES; ++i) { + px = trainDataData[i * trainData.cols()]; + py = trainDataData[i * trainData.cols() + 1]; + Imgproc.circle(I, new Point(px, py), 3, new Scalar(255, 0, 0), thick, lineType, 0); + } + //! [show_data] + + // ------------------------- 6. Show support vectors-------------------------------------------- + //! [show_vectors] + thick = 2; + Mat sv = svm.getUncompressedSupportVectors(); + float[] svData = new float[(int) (sv.total() * sv.channels())]; + sv.get(0, 0, svData); + for (int i = 0; i < sv.rows(); i++) { + Imgproc.circle(I, new Point(svData[i * sv.cols()], svData[i * sv.cols() + 1]), 6, new Scalar(128, 128, 128), + thick, lineType, 0); + } + //! 
[show_vectors] + + Imgcodecs.imwrite("result.png", I); // save the Image + HighGui.imshow("SVM for Non-Linear Training Data", I); // show it to the user + HighGui.waitKey(); + System.exit(0); + } +} diff --git a/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py b/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py index 060167484c..fb5d68ac07 100644 --- a/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py +++ b/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py @@ -25,8 +25,8 @@ def thresh_callback(val): boundRect = [None]*len(contours) centers = [None]*len(contours) radius = [None]*len(contours) - for i in range(len(contours)): - contours_poly[i] = cv.approxPolyDP(contours[i], 3, True) + for i, c in enumerate(contours): + contours_poly[i] = cv.approxPolyDP(c, 3, True) boundRect[i] = cv.boundingRect(contours_poly[i]) centers[i], radius[i] = cv.minEnclosingCircle(contours_poly[i]) ## [allthework] diff --git a/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py b/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py index a461aba49b..16787718f6 100644 --- a/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py +++ b/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py @@ -22,22 +22,22 @@ def thresh_callback(val): # Find the rotated rectangles and ellipses for each contour minRect = [None]*len(contours) minEllipse = [None]*len(contours) - for i in range(len(contours)): - minRect[i] = cv.minAreaRect(contours[i]) - if contours[i].shape[0] > 5: - minEllipse[i] = cv.fitEllipse(contours[i]) + for i, c in enumerate(contours): + minRect[i] = cv.minAreaRect(c) + if c.shape[0] > 5: + minEllipse[i] = cv.fitEllipse(c) # Draw contours + rotated rects + ellipses ## [zeroMat] drawing = np.zeros((canny_output.shape[0], canny_output.shape[1], 3), dtype=np.uint8) ## [zeroMat] ## [forContour] - for i in range(len(contours)): + for i, c in enumerate(contours): color = (rng.randint(0,256), rng.randint(0,256), rng.randint(0,256)) # contour cv.drawContours(drawing, contours, i, color) # ellipse - if contours[i].shape[0] > 5: + if c.shape[0] > 5: cv.ellipse(drawing, minEllipse[i], color, 2) # rotated rectangle box = cv.boxPoints(minRect[i]) diff --git a/samples/python/tutorial_code/core/mat_operations/mat_operations.py b/samples/python/tutorial_code/core/mat_operations/mat_operations.py new file mode 100644 index 0000000000..e9ec03699d --- /dev/null +++ b/samples/python/tutorial_code/core/mat_operations/mat_operations.py @@ -0,0 +1,92 @@ +from __future__ import division +import cv2 as cv +import numpy as np + +# Snippet code for Operations with images tutorial (not intended to be run) + +def load(): + # Input/Output + filename = 'img.jpg' + ## [Load an image from a file] + img = cv.imread(filename) + ## [Load an image from a file] + + ## [Load an image from a file in grayscale] + img = cv.imread(filename, cv.IMREAD_GRAYSCALE) + ## [Load an image from a file in grayscale] + + ## [Save image] + cv.imwrite(filename, img) + ## [Save image] + +def access_pixel(): + # Accessing pixel intensity values + img = np.empty((4,4,3), np.uint8) + y = 0 + x = 0 + ## [Pixel access 1] + intensity = img[y,x] + ## [Pixel access 1] + + ## [Pixel access 3] + blue = img[y,x,0] + green = img[y,x,1] 
+ red = img[y,x,2] + ## [Pixel access 3] + + ## [Pixel access 5] + img[y,x] = 128 + ## [Pixel access 5] + +def reference_counting(): + # Memory management and reference counting + ## [Reference counting 2] + img = cv.imread('image.jpg') + img1 = np.copy(img) + ## [Reference counting 2] + + ## [Reference counting 3] + img = cv.imread('image.jpg') + sobelx = cv.Sobel(img, cv.CV_32F, 1, 0); + ## [Reference counting 3] + +def primitive_operations(): + img = np.empty((4,4,3), np.uint8) + ## [Set image to black] + img[:] = 0 + ## [Set image to black] + + ## [Select ROI] + smallImg = img[10:110,10:110] + ## [Select ROI] + + ## [BGR to Gray] + img = cv.imread('image.jpg') + grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY) + ## [BGR to Gray] + + src = np.ones((4,4), np.uint8) + ## [Convert to CV_32F] + dst = src.astype(np.float32) + ## [Convert to CV_32F] + +def visualize_images(): + ## [imshow 1] + img = cv.imread('image.jpg') + cv.namedWindow('image', cv.WINDOW_AUTOSIZE) + cv.imshow('image', img) + cv.waitKey() + ## [imshow 1] + + ## [imshow 2] + img = cv.imread('image.jpg') + grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY) + sobelx = cv.Sobel(grey, cv.CV_32F, 1, 0) + # find minimum and maximum intensities + minVal = np.amin(sobelx) + maxVal = np.amax(sobelx) + draw = cv.convertScaleAbs(sobelx, alpha=255.0/(maxVal - minVal), beta=-minVal * 255.0/(maxVal - minVal)) + cv.namedWindow('image', cv.WINDOW_AUTOSIZE) + cv.imshow('image', draw) + cv.waitKey() + ## [imshow 2] diff --git a/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py new file mode 100644 index 0000000000..28baf3f8a0 --- /dev/null +++ b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py @@ -0,0 +1,55 @@ +from __future__ import print_function +from builtins import input +import cv2 as cv +import numpy as np +import argparse + +# Read image given by user +## [basic-linear-transform-load] +parser = argparse.ArgumentParser(description='Code for Changing the contrast and brightness of an image! 
tutorial.') +parser.add_argument('--input', help='Path to input image.', default='../data/lena.jpg') +args = parser.parse_args() + +image = cv.imread(args.input) +if image is None: + print('Could not open or find the image: ', args.input) + exit(0) +## [basic-linear-transform-load] + +## [basic-linear-transform-output] +new_image = np.zeros(image.shape, image.dtype) +## [basic-linear-transform-output] + +## [basic-linear-transform-parameters] +alpha = 1.0 # Simple contrast control +beta = 0 # Simple brightness control + +# Initialize values +print(' Basic Linear Transforms ') +print('-------------------------') +try: + alpha = float(input('* Enter the alpha value [1.0-3.0]: ')) + beta = int(input('* Enter the beta value [0-100]: ')) +except ValueError: + print('Error, not a number') +## [basic-linear-transform-parameters] + +# Do the operation new_image(i,j) = alpha*image(i,j) + beta +# Instead of these 'for' loops we could have used simply: +# new_image = cv.convertScaleAbs(image, alpha=alpha, beta=beta) +# but we wanted to show you how to access the pixels :) +## [basic-linear-transform-operation] +for y in range(image.shape[0]): + for x in range(image.shape[1]): + for c in range(image.shape[2]): + new_image[y,x,c] = np.clip(alpha*image[y,x,c] + beta, 0, 255) +## [basic-linear-transform-operation] + +## [basic-linear-transform-display] +# Show stuff +cv.imshow('Original Image', image) +cv.imshow('New Image', new_image) + +# Wait until user press some key +cv.waitKey() +## [basic-linear-transform-display] diff --git a/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py new file mode 100644 index 0000000000..704df1aecb --- /dev/null +++ b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py @@ -0,0 +1,74 @@ +from __future__ import print_function +from __future__ import division +import cv2 as cv +import numpy as np +import argparse + +alpha = 1.0 +alpha_max = 500 +beta = 0 +beta_max = 200 +gamma = 1.0 +gamma_max = 200 + +def basicLinearTransform(): + res = cv.convertScaleAbs(img_original, alpha=alpha, beta=beta) + img_corrected = cv.hconcat([img_original, res]) + cv.imshow("Brightness and contrast adjustments", img_corrected) + +def gammaCorrection(): + ## [changing-contrast-brightness-gamma-correction] + lookUpTable = np.empty((1,256), np.uint8) + for i in range(256): + lookUpTable[0,i] = np.clip(pow(i / 255.0, gamma) * 255.0, 0, 255) + + res = cv.LUT(img_original, lookUpTable) + ## [changing-contrast-brightness-gamma-correction] + + img_gamma_corrected = cv.hconcat([img_original, res]); + cv.imshow("Gamma correction", img_gamma_corrected); + +def on_linear_transform_alpha_trackbar(val): + global alpha + alpha = val / 100 + basicLinearTransform() + +def on_linear_transform_beta_trackbar(val): + global beta + beta = val - 100 + basicLinearTransform() + +def on_gamma_correction_trackbar(val): + global gamma + gamma = val / 100 + gammaCorrection() + +parser = argparse.ArgumentParser(description='Code for Changing the contrast and brightness of an image! 
tutorial.') +parser.add_argument('--input', help='Path to input image.', default='../data/lena.jpg') +args = parser.parse_args() + +img_original = cv.imread(args.input) +if img_original is None: + print('Could not open or find the image: ', args.input) + exit(0) + +img_corrected = np.empty((img_original.shape[0], img_original.shape[1]*2, img_original.shape[2]), img_original.dtype) +img_gamma_corrected = np.empty((img_original.shape[0], img_original.shape[1]*2, img_original.shape[2]), img_original.dtype) + +img_corrected = cv.hconcat([img_original, img_original]) +img_gamma_corrected = cv.hconcat([img_original, img_original]) + +cv.namedWindow('Brightness and contrast adjustments') +cv.namedWindow('Gamma correction') + +alpha_init = int(alpha *100) +cv.createTrackbar('Alpha gain (contrast)', 'Brightness and contrast adjustments', alpha_init, alpha_max, on_linear_transform_alpha_trackbar) +beta_init = beta + 100 +cv.createTrackbar('Beta bias (brightness)', 'Brightness and contrast adjustments', beta_init, beta_max, on_linear_transform_beta_trackbar) +gamma_init = int(gamma * 100) +cv.createTrackbar('Gamma correction', 'Gamma correction', gamma_init, gamma_max, on_gamma_correction_trackbar) + +on_linear_transform_alpha_trackbar(alpha_init) +on_gamma_correction_trackbar(gamma_init) + +cv.waitKey() diff --git a/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py b/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py new file mode 100644 index 0000000000..af312d58f5 --- /dev/null +++ b/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py @@ -0,0 +1,100 @@ +from __future__ import print_function +from __future__ import division +import cv2 as cv +import numpy as np +import argparse +from math import atan2, cos, sin, sqrt, pi + +def drawAxis(img, p_, q_, colour, scale): + p = list(p_) + q = list(q_) + ## [visualization1] + angle = atan2(p[1] - q[1], p[0] - q[0]) # angle in radians + hypotenuse = sqrt((p[1] - q[1]) * (p[1] - q[1]) + (p[0] - q[0]) * (p[0] - q[0])) + + # Here we lengthen the arrow by a factor of scale + q[0] = p[0] - scale * hypotenuse * cos(angle) + q[1] = p[1] - scale * hypotenuse * sin(angle) + cv.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv.LINE_AA) + + # create the arrow hooks + p[0] = q[0] + 9 * cos(angle + pi / 4) + p[1] = q[1] + 9 * sin(angle + pi / 4) + cv.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv.LINE_AA) + + p[0] = q[0] + 9 * cos(angle - pi / 4) + p[1] = q[1] + 9 * sin(angle - pi / 4) + cv.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv.LINE_AA) + ## [visualization1] + +def getOrientation(pts, img): + ## [pca] + # Construct a buffer used by the pca analysis + sz = len(pts) + data_pts = np.empty((sz, 2), dtype=np.float64) + for i in range(data_pts.shape[0]): + data_pts[i,0] = pts[i,0,0] + data_pts[i,1] = pts[i,0,1] + + # Perform PCA analysis + mean = np.empty((0)) + mean, eigenvectors, eigenvalues = cv.PCACompute2(data_pts, mean) + + # Store the center of the object + cntr = (int(mean[0,0]), int(mean[0,1])) + ## [pca] + + ## [visualization] + # Draw the principal components + cv.circle(img, cntr, 3, (255, 0, 255), 2) + p1 = (cntr[0] + 0.02 * eigenvectors[0,0] * eigenvalues[0,0], cntr[1] + 0.02 * eigenvectors[0,1] * eigenvalues[0,0]) + p2 = (cntr[0] - 0.02 * eigenvectors[1,0] * eigenvalues[1,0], cntr[1] - 0.02 * eigenvectors[1,1] * eigenvalues[1,0]) + drawAxis(img, cntr, p1, (0, 255, 0), 1) + drawAxis(img, cntr, p2, (255, 255, 0), 
5) + + angle = atan2(eigenvectors[0,1], eigenvectors[0,0]) # orientation in radians + ## [visualization] + + return angle + +## [pre-process] +# Load image +parser = argparse.ArgumentParser(description='Code for Introduction to Principal Component Analysis (PCA) tutorial.\ + This program demonstrates how to use OpenCV PCA to extract the orientation of an object.') +parser.add_argument('--input', help='Path to input image.', default='../data/pca_test1.jpg') +args = parser.parse_args() + +src = cv.imread(args.input) +# Check if image is loaded successfully +if src is None: + print('Could not open or find the image: ', args.input) + exit(0) + +cv.imshow('src', src) + +# Convert image to grayscale +gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY) + +# Convert image to binary +_, bw = cv.threshold(gray, 50, 255, cv.THRESH_BINARY | cv.THRESH_OTSU) +## [pre-process] + +## [contours] +# Find all the contours in the thresholded image +_, contours, _ = cv.findContours(bw, cv.RETR_LIST, cv.CHAIN_APPROX_NONE) + +for i, c in enumerate(contours): + # Calculate the area of each contour + area = cv.contourArea(c); + # Ignore contours that are too small or too large + if area < 1e2 or 1e5 < area: + continue + + # Draw each contour only for visualisation purposes + cv.drawContours(src, contours, i, (0, 0, 255), 2); + # Find the orientation of each shape + getOrientation(c, src) +## [contours] + +cv.imshow('output', src) +cv.waitKey() diff --git a/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py b/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py new file mode 100644 index 0000000000..1a5f202420 --- /dev/null +++ b/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py @@ -0,0 +1,62 @@ +import cv2 as cv +import numpy as np + +# Set up training data +## [setup1] +labels = np.array([1, -1, -1, -1]) +trainingData = np.matrix([[501, 10], [255, 10], [501, 255], [10, 501]], dtype=np.float32) +## [setup1] + +# Train the SVM +## [init] +svm = cv.ml.SVM_create() +svm.setType(cv.ml.SVM_C_SVC) +svm.setKernel(cv.ml.SVM_LINEAR) +svm.setTermCriteria((cv.TERM_CRITERIA_MAX_ITER, 100, 1e-6)) +## [init] +## [train] +svm.train(trainingData, cv.ml.ROW_SAMPLE, labels) +## [train] + +# Data for visual representation +width = 512 +height = 512 +image = np.zeros((height, width, 3), dtype=np.uint8) + +# Show the decision regions given by the SVM +## [show] +green = (0,255,0) +blue = (255,0,0) +for i in range(image.shape[0]): + for j in range(image.shape[1]): + sampleMat = np.matrix([[j,i]], dtype=np.float32) + response = svm.predict(sampleMat)[1] + + if response == 1: + image[i,j] = green + elif response == -1: + image[i,j] = blue +## [show] + +# Show the training data +## [show_data] +thickness = -1 +cv.circle(image, (501, 10), 5, ( 0, 0, 0), thickness) +cv.circle(image, (255, 10), 5, (255, 255, 255), thickness) +cv.circle(image, (501, 255), 5, (255, 255, 255), thickness) +cv.circle(image, ( 10, 501), 5, (255, 255, 255), thickness) +## [show_data] + +# Show support vectors +## [show_vectors] +thickness = 2 +sv = svm.getUncompressedSupportVectors() + +for i in range(sv.shape[0]): + cv.circle(image, (sv[i,0], sv[i,1]), 6, (128, 128, 128), thickness) +## [show_vectors] + +cv.imwrite('result.png', image) # save the image + +cv.imshow('SVM Simple Example', image) # show it to the user +cv.waitKey() diff --git a/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py b/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py new file mode 100644 index 
0000000000..fc4b56c454 --- /dev/null +++ b/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py @@ -0,0 +1,117 @@ +from __future__ import print_function +import cv2 as cv +import numpy as np +import random as rng + +NTRAINING_SAMPLES = 100 # Number of training samples per class +FRAC_LINEAR_SEP = 0.9 # Fraction of samples which compose the linear separable part + +# Data for visual representation +WIDTH = 512 +HEIGHT = 512 +I = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8) + +# --------------------- 1. Set up training data randomly --------------------------------------- +trainData = np.empty((2*NTRAINING_SAMPLES, 2), dtype=np.float32) +labels = np.empty((2*NTRAINING_SAMPLES, 1), dtype=np.int32) + +rng.seed(100) # Random value generation class + +# Set up the linearly separable part of the training data +nLinearSamples = int(FRAC_LINEAR_SEP * NTRAINING_SAMPLES) + +## [setup1] +# Generate random points for the class 1 +trainClass = trainData[0:nLinearSamples,:] +# The x coordinate of the points is in [0, 0.4) +c = trainClass[:,0:1] +c[:] = np.random.uniform(0.0, 0.4 * WIDTH, c.shape) +# The y coordinate of the points is in [0, 1) +c = trainClass[:,1:2] +c[:] = np.random.uniform(0.0, HEIGHT, c.shape) + +# Generate random points for the class 2 +trainClass = trainData[2*NTRAINING_SAMPLES-nLinearSamples:2*NTRAINING_SAMPLES,:] +# The x coordinate of the points is in [0.6, 1] +c = trainClass[:,0:1] +c[:] = np.random.uniform(0.6*WIDTH, WIDTH, c.shape) +# The y coordinate of the points is in [0, 1) +c = trainClass[:,1:2] +c[:] = np.random.uniform(0.0, HEIGHT, c.shape) +## [setup1] + +#------------------ Set up the non-linearly separable part of the training data --------------- +## [setup2] +# Generate random points for the classes 1 and 2 +trainClass = trainData[nLinearSamples:2*NTRAINING_SAMPLES-nLinearSamples,:] +# The x coordinate of the points is in [0.4, 0.6) +c = trainClass[:,0:1] +c[:] = np.random.uniform(0.4*WIDTH, 0.6*WIDTH, c.shape) +# The y coordinate of the points is in [0, 1) +c = trainClass[:,1:2] +c[:] = np.random.uniform(0.0, HEIGHT, c.shape) +## [setup2] + +#------------------------- Set up the labels for the classes --------------------------------- +labels[0:NTRAINING_SAMPLES,:] = 1 # Class 1 +labels[NTRAINING_SAMPLES:2*NTRAINING_SAMPLES,:] = 2 # Class 2 + +#------------------------ 2. Set up the support vector machines parameters -------------------- +print('Starting training process') +## [init] +svm = cv.ml.SVM_create() +svm.setType(cv.ml.SVM_C_SVC) +svm.setC(0.1) +svm.setKernel(cv.ml.SVM_LINEAR) +svm.setTermCriteria((cv.TERM_CRITERIA_MAX_ITER, int(1e7), 1e-6)) +## [init] + +#------------------------ 3. Train the svm ---------------------------------------------------- +## [train] +svm.train(trainData, cv.ml.ROW_SAMPLE, labels) +## [train] +print('Finished training process') + +#------------------------ 4. Show the decision regions ---------------------------------------- +## [show] +green = (0,100,0) +blue = (100,0,0) +for i in range(I.shape[0]): + for j in range(I.shape[1]): + sampleMat = np.matrix([[j,i]], dtype=np.float32) + response = svm.predict(sampleMat)[1] + + if response == 1: + I[i,j] = green + elif response == 2: + I[i,j] = blue +## [show] + +#----------------------- 5. 
Show the training data -------------------------------------------- +## [show_data] +thick = -1 +# Class 1 +for i in range(NTRAINING_SAMPLES): + px = trainData[i,0] + py = trainData[i,1] + cv.circle(I, (px, py), 3, (0, 255, 0), thick) + +# Class 2 +for i in range(NTRAINING_SAMPLES, 2*NTRAINING_SAMPLES): + px = trainData[i,0] + py = trainData[i,1] + cv.circle(I, (px, py), 3, (255, 0, 0), thick) +## [show_data] + +#------------------------- 6. Show support vectors -------------------------------------------- +## [show_vectors] +thick = 2 +sv = svm.getUncompressedSupportVectors() + +for i in range(sv.shape[0]): + cv.circle(I, (sv[i,0], sv[i,1]), 6, (128, 128, 128), thick) +## [show_vectors] + +cv.imwrite('result.png', I) # save the Image +cv.imshow('SVM for Non-Linear Training Data', I) # show it to the user +cv.waitKey()
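The contrast/brightness and gamma samples added above both implement the same two pixel mappings: a linear transform g(i,j) = alpha*f(i,j) + beta and a gamma curve g(i,j) = ((f(i,j)/255)^gamma)*255 applied through a 256-entry lookup table. A condensed standalone sketch of both, using the same cv2/numpy calls as the samples (the image path in the usage line is only an assumption, and this sketch is not part of the patch itself):

import cv2 as cv
import numpy as np

def adjust(img, alpha=1.0, beta=0, gamma=1.0):
    # contrast/brightness: convertScaleAbs applies alpha*img + beta with a saturating cast to [0, 255]
    out = cv.convertScaleAbs(img, alpha=alpha, beta=beta)
    # gamma correction: build the lookup table once, then remap every pixel through it
    lut = np.array([np.clip(pow(i / 255.0, gamma) * 255.0, 0, 255) for i in range(256)],
                   dtype=np.uint8)
    return cv.LUT(out, lut)

# usage (path and parameter values are assumptions):
# corrected = adjust(cv.imread('../data/lena.jpg'), alpha=1.3, beta=20, gamma=0.8)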
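Likewise, the introduction_to_pca samples above obtain an object's orientation by running PCA on its contour points and taking atan2 of the first eigenvector. A minimal sketch of just that step, assuming a build that exposes cv.PCACompute2 (as the samples do); the helper name is illustrative only:

import cv2 as cv
import numpy as np
from math import atan2

def contour_orientation(contour):
    # OpenCV contours have shape (N, 1, 2); PCA expects an N x 2 float buffer of (x, y) points
    pts = contour.reshape(-1, 2).astype(np.float64)
    mean, eigenvectors, eigenvalues = cv.PCACompute2(pts, np.empty((0)))
    # angle of the first principal component, in radians, as in the samples
    return atan2(eigenvectors[0, 1], eigenvectors[0, 0])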