Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/11996/head
Alexander Alekhin 7 years ago
commit 4560909a5e
  1. 3rdparty/ittnotify/src/ittnotify/ittnotify_config.h (2 changes)
  2. 3rdparty/openexr/CMakeLists.txt (4 changes)
  3. 3rdparty/protobuf/CMakeLists.txt (3 changes)
  4. CMakeLists.txt (12 changes)
  5. apps/createsamples/utility.cpp (4 changes)
  6. cmake/OpenCVDetectCUDA.cmake (21 changes)
  7. cmake/OpenCVDetectInferenceEngine.cmake (41 changes)
  8. doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown (168 changes)
  9. doc/tutorials/core/mat_operations.markdown (313 changes)
  10. doc/tutorials/core/table_of_content_core.markdown (6 changes)
  11. doc/tutorials/ml/introduction_to_pca/introduction_to_pca.markdown (109 changes)
  12. doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown (160 changes)
  13. doc/tutorials/ml/non_linear_svms/non_linear_svms.markdown (190 changes)
  14. doc/tutorials/ml/table_of_content_ml.markdown (6 changes)
  15. modules/calib3d/misc/java/gen_dict.json (3 changes)
  16. modules/calib3d/src/calibinit.cpp (5 changes)
  17. modules/calib3d/src/calibration.cpp (12 changes)
  18. modules/calib3d/src/dls.cpp (1 change)
  19. modules/calib3d/src/fisheye.cpp (12 changes)
  20. modules/calib3d/src/homography_decomp.cpp (1 change)
  21. modules/calib3d/test/test_cameracalibration_badarg.cpp (7 changes)
  22. modules/core/include/opencv2/core.hpp (10 changes)
  23. modules/core/include/opencv2/core/cvdef.h (18 changes)
  24. modules/core/include/opencv2/core/hal/intrin.hpp (532 changes)
  25. modules/core/include/opencv2/core/hal/intrin_avx.hpp (2016 changes)
  26. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (101 changes)
  27. modules/core/include/opencv2/core/hal/intrin_neon.hpp (176 changes)
  28. modules/core/include/opencv2/core/hal/intrin_sse.hpp (163 changes)
  29. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (47 changes)
  30. modules/core/include/opencv2/core/mat.hpp (10 changes)
  31. modules/core/include/opencv2/core/mat.inl.hpp (8 changes)
  32. modules/core/include/opencv2/core/matx.hpp (17 changes)
  33. modules/core/include/opencv2/core/operations.hpp (100 changes)
  34. modules/core/perf/perf_mat.cpp (6 changes)
  35. modules/core/src/arithm.cpp (2 changes)
  36. modules/core/src/array.cpp (21 changes)
  37. modules/core/src/batch_distance.cpp (1 change)
  38. modules/core/src/check.cpp (2 changes)
  39. modules/core/src/convert.fp16.cpp (3 changes)
  40. modules/core/src/copy.cpp (2 changes)
  41. modules/core/src/lapack.cpp (3 changes)
  42. modules/core/src/mathfuncs_core.simd.hpp (1244 changes)
  43. modules/core/src/pca.cpp (26 changes)
  44. modules/core/src/rand.cpp (2 changes)
  45. modules/core/test/test_intrin.cpp (4 changes)
  46. modules/core/test/test_intrin.fp16.cpp (4 changes)
  47. modules/core/test/test_intrin_utils.hpp (124 changes)
  48. modules/core/test/test_math.cpp (73 changes)
  49. modules/core/test/test_operations.cpp (8 changes)
  50. modules/core/test/test_rand.cpp (5 changes)
  51. modules/dnn/include/opencv2/dnn/dnn.hpp (46 changes)
  52. modules/dnn/misc/java/test/DnnTensorFlowTest.java (71 changes)
  53. modules/dnn/src/caffe/caffe_importer.cpp (9 changes)
  54. modules/dnn/src/darknet/darknet_importer.cpp (78 changes)
  55. modules/dnn/src/darknet/darknet_io.cpp (121 changes)
  56. modules/dnn/src/darknet/darknet_io.hpp (7 changes)
  57. modules/dnn/src/dnn.cpp (59 changes)
  58. modules/dnn/src/layers/detection_output_layer.cpp (6 changes)
  59. modules/dnn/src/layers/prior_box_layer.cpp (14 changes)
  60. modules/dnn/src/layers/resize_layer.cpp (2 changes)
  61. modules/dnn/src/layers/shuffle_channel_layer.cpp (1 change)
  62. modules/dnn/src/layers/softmax_layer.cpp (13 changes)
  63. modules/dnn/src/ocl4dnn/src/math_functions.cpp (2 changes)
  64. modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp (21 changes)
  65. modules/dnn/src/opencl/conv_layer_spatial.cl (20 changes)
  66. modules/dnn/src/opencl/prior_box.cl (10 changes)
  67. modules/dnn/src/tensorflow/tf_importer.cpp (9 changes)
  68. modules/dnn/test/test_darknet_importer.cpp (28 changes)
  69. modules/dnn/test/test_halide_layers.cpp (17 changes)
  70. modules/dnn/test/test_layers.cpp (6 changes)
  71. modules/dnn/test/test_tf_importer.cpp (61 changes)
  72. modules/features2d/src/bagofwords.cpp (1 change)
  73. modules/features2d/src/blobdetector.cpp (3 changes)
  74. modules/features2d/src/brisk.cpp (2 changes)
  75. modules/imgproc/perf/perf_warp.cpp (8 changes)
  76. modules/imgproc/src/contours.cpp (7 changes)
  77. modules/imgproc/src/deriv.cpp (2 changes)
  78. modules/imgproc/src/drawing.cpp (9 changes)
  79. modules/imgproc/src/filter.cpp (12 changes)
  80. modules/imgproc/src/floodfill.cpp (7 changes)
  81. modules/imgproc/src/grabcut.cpp (1 change)
  82. modules/imgproc/src/imgwarp.cpp (7 changes)
  83. modules/imgproc/src/linefit.cpp (2 changes)
  84. modules/imgproc/src/lsd.cpp (1 change)
  85. modules/imgproc/src/moments.cpp (11 changes)
  86. modules/imgproc/src/smooth.cpp (6 changes)
  87. modules/imgproc/test/test_grabcut.cpp (1 change)
  88. modules/imgproc/test/test_moments.cpp (7 changes)
  89. modules/java/generator/gen_java.py (24 changes)
  90. modules/ml/src/em.cpp (1 change)
  91. modules/ml/src/rtrees.cpp (1 change)
  92. modules/ml/src/tree.cpp (2 changes)
  93. modules/objdetect/include/opencv2/objdetect.hpp (15 changes)
  94. modules/objdetect/src/haar.cpp (11 changes)
  95. modules/objdetect/src/qrcode.cpp (864 changes)
  96. modules/objdetect/test/test_qrcode.cpp (173 changes)
  97. modules/photo/src/tonemap.cpp (6 changes)
  98. modules/shape/src/sc_dis.cpp (1 change)
  99. modules/ts/src/ts_func.cpp (2 changes)
  100. modules/videoio/include/opencv2/videoio/container_avi.private.hpp (2 changes)
  Some files were not shown because too many files have changed in this diff.

@@ -335,7 +335,7 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
#ifdef SDL_STRNCPY_S
#define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l)
#else
#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, l)
#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, b)
#endif /* SDL_STRNCPY_S */
#define __itt_fstrdup(s) strdup(s)
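A minimal sketch of why the strncpy bound changes from `l` to `b` here, assuming (as the parameter order `__itt_fstrcpyn(s1, b, s2, l)` suggests) that `b` is the destination buffer size and `l` the source length; the names `dst_size` and `src_len` below are illustrative only:

#include <string.h>

/* Bounding the copy by the source length alone can overflow the destination
   when the source is longer than the buffer; bounding by the destination
   capacity keeps every write inside the buffer (assumes dst_size > 0). */
static void copy_bounded(char* dst, size_t dst_size, const char* src, size_t src_len)
{
    size_t n = (src_len < dst_size - 1) ? src_len : dst_size - 1;
    strncpy(dst, src, n);
    dst[n] = '\0'; /* strncpy does not always NUL-terminate */
}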

@@ -47,6 +47,10 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -W
-Wsuggest-override -Winconsistent-missing-override
-Wimplicit-fallthrough
)
if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess)
endif()
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4018 /wd4099 /wd4100 /wd4101 /wd4127 /wd4189 /wd4245 /wd4305 /wd4389 /wd4512 /wd4701 /wd4702 /wd4706 /wd4800) # vs2005
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4334) # vs2005 Win64
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244) # vs2008

@@ -29,6 +29,9 @@ if(CV_ICC)
-wd265 -wd858 -wd873 -wd2196
)
endif()
if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess)
endif()
# Easier to support different versions of protobufs
function(append_if_exist OUTPUT_LIST)

@@ -1403,7 +1403,17 @@ if(WITH_HALIDE OR HAVE_HALIDE)
endif()
if(WITH_INF_ENGINE OR HAVE_INF_ENGINE)
status(" Inference Engine:" HAVE_INF_ENGINE THEN "YES (${INF_ENGINE_LIBRARIES} ${INF_ENGINE_INCLUDE_DIRS})" ELSE NO)
if(HAVE_INF_ENGINE)
set(__msg "YES")
if(DEFINED INF_ENGINE_VERSION)
set(__msg "YES (ver ${INF_ENGINE_VERSION})")
endif()
status(" Inference Engine:" "${__msg}")
status(" libs:" "${INF_ENGINE_LIBRARIES}")
status(" includes:" "${INF_ENGINE_INCLUDE_DIRS}")
else()
status(" Inference Engine:" "NO")
endif()
endif()
if(WITH_EIGEN OR HAVE_EIGEN)

@@ -54,6 +54,10 @@
#include "opencv2/highgui.hpp"
#include "opencv2/calib3d.hpp"
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
using namespace cv;
#ifndef PATH_MAX

@@ -8,14 +8,23 @@ if(NOT APPLE AND CV_CLANG)
return()
endif()
set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
if(ANDROID)
set(CUDA_TARGET_OS_VARIANT "Android")
endif()
find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
if(((NOT CMAKE_VERSION VERSION_LESS "3.9.0") # requires https://gitlab.kitware.com/cmake/cmake/merge_requests/663
OR OPENCV_CUDA_FORCE_EXTERNAL_CMAKE_MODULE)
AND NOT OPENCV_CUDA_FORCE_BUILTIN_CMAKE_MODULE)
ocv_update(CUDA_LINK_LIBRARIES_KEYWORD "LINK_PRIVATE")
find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
else()
# Use OpenCV's patched "FindCUDA" module
set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
if(ANDROID)
set(CUDA_TARGET_OS_VARIANT "Android")
endif()
find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
list(REMOVE_AT CMAKE_MODULE_PATH 0)
list(REMOVE_AT CMAKE_MODULE_PATH 0)
endif()
if(CUDA_FOUND)
set(HAVE_CUDA 1)

@@ -16,22 +16,32 @@ macro(ie_fail)
endmacro()
find_package(InferenceEngine QUIET)
if(InferenceEngine_FOUND)
set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}")
set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}")
set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
set(HAVE_INF_ENGINE TRUE)
return()
endif()
ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH)
if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
if(DEFINED ENV{INTEL_CVSDK_DIR})
list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}")
list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}/inference_engine")
endif()
if(DEFINED INTEL_CVSDK_DIR)
list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}")
list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/inference_engine")
list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/")
list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine")
endif()
if(NOT ie_root_paths)
list(APPEND ie_root_paths "/opt/intel/deeplearning_deploymenttoolkit/deployment_tools/inference_engine")
list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/")
endif()
find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths})
if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$")
unset(INF_ENGINE_ROOT_DIR CACHE)
endif()
endif()
set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory")
@@ -40,6 +50,7 @@ if(NOT INF_ENGINE_ROOT_DIR
OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}"
OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp"
)
message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.")
ie_fail()
endif()
@@ -47,19 +58,19 @@ set(INF_ENGINE_LIBRARIES "")
set(ie_lib_list inference_engine)
if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}")
set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}")
endif()
link_directories(
${INTEL_CVSDK_DIR}/inference_engine/external/mkltiny_lnx/lib
${INTEL_CVSDK_DIR}/inference_engine/external/cldnn/lib
${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib
${INF_ENGINE_ROOT_DIR}/external/cldnn/lib
)
foreach(lib ${ie_lib_list})
find_library(${lib}
NAMES ${lib}
# For inference_engine
HINTS ${IE_PLUGINS_PATH}
HINTS "$ENV{IE_PLUGINS_PATH}"
)
find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH})
if(NOT ${lib})
message(WARNING "DL IE: Can't find library: '${lib}'")
ie_fail()
endif()
list(APPEND INF_ENGINE_LIBRARIES ${${lib}})

@@ -53,48 +53,143 @@ Theory
Code
----
@add_toggle_cpp
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp)
- The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ :
@include BasicLinearTransforms.cpp
@include samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp
@end_toggle
@add_toggle_java
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java)
- The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ :
@include samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java
@end_toggle
@add_toggle_python
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py)
- The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ :
@include samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py
@end_toggle
Explanation
-----------
-# We begin by creating parameters to save \f$\alpha\f$ and \f$\beta\f$ to be entered by the user:
@snippet BasicLinearTransforms.cpp basic-linear-transform-parameters
- We load an image using @ref cv::imread and save it in a Mat object:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-load
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-load
@end_toggle
-# We load an image using @ref cv::imread and save it in a Mat object:
@snippet BasicLinearTransforms.cpp basic-linear-transform-load
-# Now, since we will make some transformations to this image, we need a new Mat object to store
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-load
@end_toggle
- Now, since we will make some transformations to this image, we need a new Mat object to store
it. Also, we want this to have the following features:
- Initial pixel values equal to zero
- Same size and type as the original image
@snippet BasicLinearTransforms.cpp basic-linear-transform-output
We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer based on
*image.size()* and *image.type()*
-# Now, to perform the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ we will access to each
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-output
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-output
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-output
@end_toggle
We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer based on
*image.size()* and *image.type()*
- We now ask the user to enter the values of \f$\alpha\f$ and \f$\beta\f$:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-parameters
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-parameters
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-parameters
@end_toggle
- Now, to perform the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ we will access each
pixel in the image. Since we are operating with BGR images, we will have three values per pixel (B,
G and R), so we will also access them separately. Here is the piece of code:
@snippet BasicLinearTransforms.cpp basic-linear-transform-operation
Notice the following:
- To access each pixel in the images we are using this syntax: *image.at\<Vec3b\>(y,x)[c]*
where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2).
- Since the operation \f$\alpha \cdot p(i,j) + \beta\f$ can give values out of range or not
integers (if \f$\alpha\f$ is float), we use cv::saturate_cast to make sure the
values are valid.
-# Finally, we create windows and show the images, the usual way.
@snippet BasicLinearTransforms.cpp basic-linear-transform-display
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-operation
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-operation
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-operation
@end_toggle
Notice the following (**C++ code only**):
- To access each pixel in the images we are using this syntax: *image.at\<Vec3b\>(y,x)[c]*
where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2).
- Since the operation \f$\alpha \cdot p(i,j) + \beta\f$ can give values that are out of range or
that are not integers (if \f$\alpha\f$ is a float), we use cv::saturate_cast to make sure the
values are valid.
- Finally, we create windows and show the images, the usual way.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp basic-linear-transform-display
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/BasicLinearTransformsDemo.java basic-linear-transform-display
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/BasicLinearTransforms.py basic-linear-transform-display
@end_toggle
@note
Instead of using the **for** loops to access each pixel, we could have simply used this command:
@code{.cpp}
image.convertTo(new_image, -1, alpha, beta);
@endcode
where @ref cv::Mat::convertTo would effectively perform *new_image = a*image + beta\*. However, we
wanted to show you how to access each pixel. In any case, both methods give the same result but
convertTo is more optimized and works a lot faster.
@add_toggle_cpp
@code{.cpp}
image.convertTo(new_image, -1, alpha, beta);
@endcode
@end_toggle
@add_toggle_java
@code{.java}
image.convertTo(newImage, -1, alpha, beta);
@endcode
@end_toggle
@add_toggle_python
@code{.py}
new_image = cv.convertScaleAbs(image, alpha=alpha, beta=beta)
@endcode
@end_toggle
where @ref cv::Mat::convertTo would effectively perform *new_image = alpha\*image + beta*. However, we
wanted to show you how to access each pixel. In any case, both methods give the same result, but
convertTo is more optimized and works a lot faster.
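For reference, a condensed C++ sketch of the per-pixel transform the snippets above refer to, assuming an 8-bit, 3-channel BGR input; this mirrors the tutorial logic but is not the downloadable sample itself:
@code{.cpp}
#include <opencv2/core.hpp>

cv::Mat linearTransform(const cv::Mat& image, double alpha, int beta)
{
    // Same size and type as the original image, initialized to zero
    cv::Mat new_image = cv::Mat::zeros(image.size(), image.type());
    for (int y = 0; y < image.rows; y++)
        for (int x = 0; x < image.cols; x++)
            for (int c = 0; c < image.channels(); c++)
                // saturate_cast clamps the result to the valid 0..255 range
                new_image.at<cv::Vec3b>(y, x)[c] =
                    cv::saturate_cast<uchar>(alpha * image.at<cv::Vec3b>(y, x)[c] + beta);
    return new_image;
}
@endcode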
Result
------
@@ -185,10 +280,31 @@ and are not intended to be used as a replacement of a raster graphics editor!**
### Code
@add_toggle_cpp
Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp).
@end_toggle
@add_toggle_java
Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java).
@end_toggle
@add_toggle_python
Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py).
@end_toggle
Code for the gamma correction:
@snippet changing_contrast_brightness_image.cpp changing-contrast-brightness-gamma-correction
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp changing-contrast-brightness-gamma-correction
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ImgProc/changing_contrast_brightness_image/ChangingContrastBrightnessImageDemo.java changing-contrast-brightness-gamma-correction
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py changing-contrast-brightness-gamma-correction
@end_toggle
A look-up table is used to improve the performance of the computation, as only 256 values need to be calculated once.
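A minimal sketch of that look-up-table approach (the helper name gammaCorrect is illustrative; the downloadable sample may differ in details):
@code{.cpp}
#include <cmath>
#include <opencv2/core.hpp>

cv::Mat gammaCorrect(const cv::Mat& img, double gamma)
{
    // Build the table once: every possible 8-bit value is mapped through the gamma curve
    cv::Mat lookUpTable(1, 256, CV_8U);
    uchar* p = lookUpTable.ptr();
    for (int i = 0; i < 256; ++i)
        p[i] = cv::saturate_cast<uchar>(std::pow(i / 255.0, gamma) * 255.0);

    cv::Mat res;
    cv::LUT(img, lookUpTable, res); // apply the table to every pixel of the image
    return res;
}
@endcode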

@@ -7,25 +7,50 @@ Input/Output
### Images
Load an image from a file:
@code{.cpp}
Mat img = imread(filename)
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Load an image from a file
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Load an image from a file
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Load an image from a file
@end_toggle
If you read a jpg file, a 3 channel image is created by default. If you need a grayscale image, use:
@code{.cpp}
Mat img = imread(filename, IMREAD_GRAYSCALE);
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Load an image from a file in grayscale
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Load an image from a file in grayscale
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Load an image from a file in grayscale
@end_toggle
@note Format of the file is determined by its content (first few bytes). To save an image to a file:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Save image
@end_toggle
@note format of the file is determined by its content (first few bytes) Save an image to a file:
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Save image
@end_toggle
@code{.cpp}
imwrite(filename, img);
@endcode
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Save image
@end_toggle
@note format of the file is determined by its extension.
@note Format of the file is determined by its extension.
@note use imdecode and imencode to read and write image from/to memory rather than a file.
@note Use cv::imdecode and cv::imencode to read and write an image from/to memory rather than a file.
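A small sketch of the in-memory encode/decode mentioned in the note above (assumes the imgcodecs module is linked; the helper name memoryRoundTrip is illustrative):
@code{.cpp}
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>

void memoryRoundTrip(const cv::Mat& img)
{
    std::vector<uchar> buf;
    cv::imencode(".png", img, buf);                         // compress into a memory buffer
    cv::Mat decoded = cv::imdecode(buf, cv::IMREAD_COLOR);  // decode the buffer back to a Mat
    CV_Assert(!decoded.empty());
}
@endcode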
Basic operations with images
----------------------------
@@ -35,49 +60,65 @@ Basic operations with images
In order to get the pixel intensity value, you have to know the type of the image and the number of
channels. Here is an example for a single channel greyscale image (type 8UC1) and pixel coordinates
x and y:
@code{.cpp}
Scalar intensity = img.at<uchar>(y, x);
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 1
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Pixel access 1
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 1
@end_toggle
C++ version only:
intensity.val[0] contains a value from 0 to 255. Note the ordering of x and y. Since in OpenCV
images are represented by the same structure as matrices, we use the same convention for both
cases - the 0-based row index (or y-coordinate) goes first and the 0-based column index (or
x-coordinate) follows it. Alternatively, you can use the following notation:
@code{.cpp}
Scalar intensity = img.at<uchar>(Point(x, y));
@endcode
x-coordinate) follows it. Alternatively, you can use the following notation (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 2
Now let us consider a 3 channel image with BGR color ordering (the default format returned by
imread):
@code{.cpp}
Vec3b intensity = img.at<Vec3b>(y, x);
uchar blue = intensity.val[0];
uchar green = intensity.val[1];
uchar red = intensity.val[2];
@endcode
**C++ code**
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 3
**Python code**
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 3
You can use the same method for floating-point images (for example, you can get such an image by
running Sobel on a 3 channel image):
@code{.cpp}
Vec3f intensity = img.at<Vec3f>(y, x);
float blue = intensity.val[0];
float green = intensity.val[1];
float red = intensity.val[2];
@endcode
running Sobel on a 3 channel image) (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 4
The same method can be used to change pixel intensities:
@code{.cpp}
img.at<uchar>(y, x) = 128;
@endcode
There are functions in OpenCV, especially from calib3d module, such as projectPoints, that take an
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Pixel access 5
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Pixel access 5
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Pixel access 5
@end_toggle
There are functions in OpenCV, especially from the calib3d module, such as cv::projectPoints, that take an
array of 2D or 3D points in the form of a Mat. The matrix should contain exactly one column, each row
corresponds to a point, and the matrix type should be 32FC2 or 32FC3 respectively. Such a matrix can be
easily constructed from `std::vector`:
@code{.cpp}
vector<Point2f> points;
//... fill the array
Mat pointsMat = Mat(points);
@endcode
One can access a point in this matrix using the same method Mat::at :
@code{.cpp}
Point2f point = pointsMat.at<Point2f>(i, 0);
@endcode
easily constructed from `std::vector` (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Mat from points vector
One can access a point in this matrix using the same method `Mat::at` (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Point access
### Memory management and reference counting
@@ -85,91 +126,141 @@ Mat is a structure that keeps matrix/image characteristics (rows and columns num
and a pointer to data. So nothing prevents us from having several instances of Mat corresponding to
the same data. A Mat keeps a reference count that tells if data has to be deallocated when a
particular instance of Mat is destroyed. Here is an example of creating two matrices without copying
data:
@code{.cpp}
std::vector<Point3f> points;
// .. fill the array
Mat pointsMat = Mat(points).reshape(1);
@endcode
As a result we get a 32FC1 matrix with 3 columns instead of 32FC3 matrix with 1 column. pointsMat
data (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 1
As a result, we get a 32FC1 matrix with 3 columns instead of a 32FC3 matrix with 1 column. `pointsMat`
uses data from `points` and will not deallocate the memory when destroyed. In this particular
instance, however, the developer has to make sure that the lifetime of `points` is longer than that of `pointsMat`.
If we need to copy the data, this is done using, for example, cv::Mat::copyTo or cv::Mat::clone:
@code{.cpp}
Mat img = imread("image.jpg");
Mat img1 = img.clone();
@endcode
To the contrary with C API where an output image had to be created by developer, an empty output Mat
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 2
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Reference counting 2
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Reference counting 2
@end_toggle
Contrary to the C API, where an output image had to be created by the developer, an empty output Mat
can be supplied to each function. Each implementation calls Mat::create for a destination matrix.
This method allocates data for a matrix if it is empty. If it is not empty and has the correct size
and type, the method does nothing. If, however, size or type are different from input arguments, the
and type, the method does nothing. If, however, the size or type is different from the input arguments, the
data is deallocated (and lost) and a new data is allocated. For example:
@code{.cpp}
Mat img = imread("image.jpg");
Mat sobelx;
Sobel(img, sobelx, CV_32F, 1, 0);
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Reference counting 3
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Reference counting 3
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Reference counting 3
@end_toggle
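A short sketch illustrating the Mat::create behaviour described above: the second call finds a destination of matching size and type and therefore reuses the existing buffer (assumes the imgproc module for cv::Sobel; the function name createReuse is illustrative):
@code{.cpp}
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

void createReuse(const cv::Mat& img)
{
    cv::Mat dst;
    cv::Sobel(img, dst, CV_32F, 1, 0);   // dst is empty, so Mat::create allocates it
    const void* before = dst.data;
    cv::Sobel(img, dst, CV_32F, 1, 0);   // same size and type: no reallocation happens
    CV_Assert(dst.data == before);
}
@endcode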
### Primitive operations
There is a number of convenient operators defined on a matrix. For example, here is how we can make
a black image from an existing greyscale image \`img\`:
@code{.cpp}
img = Scalar(0);
@endcode
a black image from an existing greyscale image `img`:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Set image to black
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Set image to black
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Set image to black
@end_toggle
Selecting a region of interest:
@code{.cpp}
Rect r(10, 10, 100, 100);
Mat smallImg = img(r);
@endcode
A conversion from Mat to C API data structures:
@code{.cpp}
Mat img = imread("image.jpg");
IplImage img1 = img;
CvMat m = img;
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Select ROI
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Select ROI
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Select ROI
@end_toggle
A conversion from Mat to C API data structures (**C++ only**):
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp C-API conversion
Note that there is no data copying here.
Conversion from color to grey scale:
@code{.cpp}
Mat img = imread("image.jpg"); // loading a 8UC3 image
Mat grey;
cvtColor(img, grey, COLOR_BGR2GRAY);
@endcode
Conversion from color to greyscale:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp BGR to Gray
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java BGR to Gray
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py BGR to Gray
@end_toggle
Change image type from 8UC1 to 32FC1:
@code{.cpp}
src.convertTo(dst, CV_32F);
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp Convert to CV_32F
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java Convert to CV_32F
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py Convert to CV_32F
@end_toggle
### Visualizing images
It is very useful to see intermediate results of your algorithm during the development process. OpenCV
provides a convenient way of visualizing images. An 8U image can be shown using:
@code{.cpp}
Mat img = imread("image.jpg");
namedWindow("image", WINDOW_AUTOSIZE);
imshow("image", img);
waitKey();
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp imshow 1
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java imshow 1
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py imshow 1
@end_toggle
A call to waitKey() starts a message passing cycle that waits for a key stroke in the "image"
window. A 32F image needs to be converted to 8U type. For example:
@code{.cpp}
Mat img = imread("image.jpg");
Mat grey;
cvtColor(img, grey, COLOR_BGR2GRAY);
Mat sobelx;
Sobel(grey, sobelx, CV_32F, 1, 0);
double minVal, maxVal;
minMaxLoc(sobelx, &minVal, &maxVal); //find minimum and maximum intensities
Mat draw;
sobelx.convertTo(draw, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal));
namedWindow("image", WINDOW_AUTOSIZE);
imshow("image", draw);
waitKey();
@endcode
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/core/mat_operations/mat_operations.cpp imshow 2
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/core/mat_operations/MatOperations.java imshow 2
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/core/mat_operations/mat_operations.py imshow 2
@end_toggle
@note Here cv::namedWindow is not necessary since it is immediately followed by cv::imshow.
Nevertheless, it can be used to change the window properties or when using cv::createTrackbar
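The 32F-to-8U scaling in the example above can also be written out explicitly; a hedged sketch, with alpha and beta chosen so that [minVal, maxVal] maps linearly onto [0, 255] (the helper name toDisplayable is illustrative):
@code{.cpp}
#include <opencv2/core.hpp>

cv::Mat toDisplayable(const cv::Mat& sobelx)
{
    double minVal, maxVal;
    cv::minMaxLoc(sobelx, &minVal, &maxVal);    // find the actual intensity range
    double alpha = 255.0 / (maxVal - minVal);   // scale factor
    double beta  = -minVal * alpha;             // offset, so that minVal maps to 0
    cv::Mat draw;
    sobelx.convertTo(draw, CV_8U, alpha, beta);
    return draw;
}
@endcode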

@@ -36,6 +36,10 @@ understanding how to manipulate the images on a pixel level.
- @subpage tutorial_mat_operations
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
Reading/writing images from file, accessing pixels, primitive operations, visualizing images.
- @subpage tutorial_adding_images
@@ -50,6 +54,8 @@ understanding how to manipulate the images on a pixel level.
- @subpage tutorial_basic_linear_transform
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán

@@ -91,37 +91,112 @@ __Find the eigenvectors and eigenvalues of the covariance matrix__
Source Code
-----------
This tutorial code's is shown lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp).
@include cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
@add_toggle_cpp
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp)
- **Code at glance:**
@include samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
@end_toggle
@add_toggle_java
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java)
- **Code at glance:**
@include samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java
@end_toggle
@add_toggle_python
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py)
- **Code at glance:**
@include samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py
@end_toggle
@note Another example using PCA for dimensionality reduction while maintaining an amount of variance can be found at [opencv_source_code/samples/cpp/pca.cpp](https://github.com/opencv/opencv/tree/master/samples/cpp/pca.cpp)
Explanation
-----------
-# __Read image and convert it to binary__
- __Read image and convert it to binary__
Here we apply the necessary pre-processing procedures in order to be able to detect the objects of interest.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pre-process
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java pre-process
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py pre-process
@end_toggle
- __Extract objects of interest__
Then find and filter contours by size and obtain the orientation of the remaining ones.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp contours
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java contours
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py contours
@end_toggle
- __Extract orientation__
Orientation is extracted by calling the getOrientation() function, which performs the whole PCA procedure.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pca
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java pca
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py pca
@end_toggle
First, the data needs to be arranged in a matrix of size n x 2, where n is the number of data points we have. Then we can perform the PCA analysis. The calculated mean (i.e. the center of mass) is stored in the _cntr_ variable, and the eigenvectors and eigenvalues are stored in the corresponding std::vector's.
Here we apply the necessary pre-processing procedures in order to be able to detect the objects of interest.
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pre-process
- __Visualize result__
-# __Extract objects of interest__
The final result is visualized through the drawAxis() function, where the principal components are drawn in lines, and each eigenvector is multiplied by its eigenvalue and translated to the mean position.
Then find and filter contours by size and obtain the orientation of the remaining ones.
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp contours
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization
@end_toggle
-# __Extract orientation__
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java visualization
@end_toggle
Orientation is extracted by calling the getOrientation() function, which performs the whole PCA procedure.
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp pca
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py visualization
@end_toggle
First, the data needs to be arranged in a matrix of size n x 2, where n is the number of data points we have. Then we can perform the PCA analysis. The calculated mean (i.e. the center of mass) is stored in the _cntr_ variable, and the eigenvectors and eigenvalues are stored in the corresponding std::vector's.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization1
@end_toggle
-# __Visualize result__
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_pca/IntroductionToPCADemo.java visualization1
@end_toggle
The final result is visualized through the drawAxis() function, where the principal components are drawn in lines, and each eigenvector is multiplied by its eigenvalue and translated to the mean position.
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization
@snippet samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp visualization1
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py visualization1
@end_toggle
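A condensed C++ sketch of the PCA step described above, assuming the contour points have already been extracted; the function name getOrientation matches the tutorial, but this is a simplified version, not the full sample:
@code{.cpp}
#include <cmath>
#include <vector>
#include <opencv2/core.hpp>

double getOrientation(const std::vector<cv::Point>& pts)
{
    // Arrange the data as an n x 2 matrix, one row per point
    cv::Mat data_pts(static_cast<int>(pts.size()), 2, CV_64F);
    for (int i = 0; i < data_pts.rows; i++)
    {
        data_pts.at<double>(i, 0) = pts[i].x;
        data_pts.at<double>(i, 1) = pts[i].y;
    }

    // Perform PCA: the mean is the center of mass, the eigenvectors are the principal axes
    cv::PCA pca(data_pts, cv::Mat(), cv::PCA::DATA_AS_ROW);
    double angle = std::atan2(pca.eigenvectors.at<double>(0, 1),
                              pca.eigenvectors.at<double>(0, 0));
    return angle;
}
@endcode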
Results
-------

@@ -96,25 +96,67 @@ Source Code
@note The following code has been implemented with OpenCV 3.0 classes and functions. An equivalent version of the code using OpenCV 2.4 can be found in [this page.](http://docs.opencv.org/2.4/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.html#introductiontosvms)
@include cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@add_toggle_cpp
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp)
- **Code at glance:**
@include samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@end_toggle
@add_toggle_java
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java)
- **Code at glance:**
@include samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java
@end_toggle
@add_toggle_python
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py)
- **Code at glance:**
@include samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py
@end_toggle
Explanation
-----------
-# **Set up the training data**
- **Set up the training data**
The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
two different classes; one of the classes consists of one point and the other of three points.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup1
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup1
@end_toggle
The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
two different classes; one of the classes consists of one point and the other of three points.
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1
@end_toggle
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup1
The function @ref cv::ml::SVM::train that will be used afterwards requires the training data to be
stored as @ref cv::Mat objects of floats. Therefore, we create these objects from the arrays
defined above:
The function @ref cv::ml::SVM::train that will be used afterwards requires the training data to be
stored as @ref cv::Mat objects of floats. Therefore, we create these objects from the arrays
defined above:
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup2
@end_toggle
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup2
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup2
@end_toggle
-# **Set up SVM's parameters**
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1
@end_toggle
- **Set up SVM's parameters**
In this tutorial we have introduced the theory of SVMs in the most simple case, when the
training examples are spread into two classes that are linearly separable. However, SVMs can be
@@ -123,35 +165,55 @@ Explanation
we have to define some parameters before training the SVM. These parameters are stored in an
object of the class @ref cv::ml::SVM.
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp init
Here:
- *Type of SVM*. We choose here the type @ref cv::ml::SVM::C_SVC "C_SVC" that can be used for
n-class classification (n \f$\geq\f$ 2). The important feature of this type is that it deals
with imperfect separation of classes (i.e. when the training data is non-linearly separable).
This feature is not important here since the data is linearly separable and we chose this SVM
type only for being the most commonly used.
- *Type of SVM kernel*. We have not talked about kernel functions since they are not
interesting for the training data we are dealing with. Nevertheless, let's explain briefly now
the main idea behind a kernel function. It is a mapping done to the training data to improve
its resemblance to a linearly separable set of data. This mapping consists of increasing the
dimensionality of the data and is done efficiently using a kernel function. We choose here the
type @ref cv::ml::SVM::LINEAR "LINEAR" which means that no mapping is done. This parameter is
defined using cv::ml::SVM::setKernel.
- *Termination criteria of the algorithm*. The SVM training procedure is implemented solving a
constrained quadratic optimization problem in an **iterative** fashion. Here we specify a
maximum number of iterations and a tolerance error so we allow the algorithm to finish in
less number of steps even if the optimal hyperplane has not been computed yet. This
parameter is defined in a structure @ref cv::TermCriteria .
-# **Train the SVM**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp init
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java init
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py init
@end_toggle
Here:
- *Type of SVM*. We choose here the type @ref cv::ml::SVM::C_SVC "C_SVC" that can be used for
n-class classification (n \f$\geq\f$ 2). The important feature of this type is that it deals
with imperfect separation of classes (i.e. when the training data is non-linearly separable).
This feature is not important here since the data is linearly separable and we chose this SVM
type only for being the most commonly used.
- *Type of SVM kernel*. We have not talked about kernel functions since they are not
interesting for the training data we are dealing with. Nevertheless, let's explain briefly now
the main idea behind a kernel function. It is a mapping done to the training data to improve
its resemblance to a linearly separable set of data. This mapping consists of increasing the
dimensionality of the data and is done efficiently using a kernel function. We choose here the
type @ref cv::ml::SVM::LINEAR "LINEAR" which means that no mapping is done. This parameter is
defined using cv::ml::SVM::setKernel.
- *Termination criteria of the algorithm*. The SVM training procedure is implemented solving a
constrained quadratic optimization problem in an **iterative** fashion. Here we specify a
maximum number of iterations and a tolerance error so that we allow the algorithm to finish in
fewer steps even if the optimal hyperplane has not been computed yet. This
parameter is defined in a structure @ref cv::TermCriteria .
- **Train the SVM**
We call the method @ref cv::ml::SVM::train to build the SVM model.
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp train
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp train
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java train
@end_toggle
-# **Regions classified by the SVM**
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py train
@end_toggle
- **Regions classified by the SVM**
The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In
this example we have used this method in order to color the space depending on the prediction done
@@ -159,16 +221,36 @@ Explanation
Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in
green if it is the class with label 1 and in blue if it is the class with label -1.
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show
@end_toggle
-# **Support vectors**
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show
@end_toggle
- **Support vectors**
We use here a couple of methods to obtain information about the support vectors.
The method @ref cv::ml::SVM::getSupportVectors obtains all of the support
vectors. We have used this method here to find the training examples that are
support vectors and highlight them.
@snippet cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show_vectors
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show_vectors
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show_vectors
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show_vectors
@end_toggle
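For orientation, a condensed C++ sketch of the whole pipeline discussed above (training data, linear C-SVM setup, training, prediction and support vectors), following the tutorial's assumptions of four labeled 2D training points:
@code{.cpp}
#include <opencv2/core.hpp>
#include <opencv2/ml.hpp>

int main()
{
    // Four labeled 2D points: one sample of class +1, three of class -1
    int labels[4] = {1, -1, -1, -1};
    float trainingData[4][2] = { {501, 10}, {255, 10}, {501, 255}, {10, 501} };
    cv::Mat trainDataMat(4, 2, CV_32F, trainingData);
    cv::Mat labelsMat(4, 1, CV_32SC1, labels);

    // Linear C-SVM with an explicit termination criterion
    cv::Ptr<cv::ml::SVM> svm = cv::ml::SVM::create();
    svm->setType(cv::ml::SVM::C_SVC);
    svm->setKernel(cv::ml::SVM::LINEAR);
    svm->setTermCriteria(cv::TermCriteria(cv::TermCriteria::MAX_ITER, 100, 1e-6));

    // Train, classify one new sample and query the support vectors
    svm->train(trainDataMat, cv::ml::ROW_SAMPLE, labelsMat);
    cv::Mat sample = (cv::Mat_<float>(1, 2) << 200.f, 200.f);
    float response = svm->predict(sample);
    cv::Mat sv = svm->getSupportVectors();   // one support vector per row
    (void)response; (void)sv;
    return 0;
}
@endcode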
Results
-------

@@ -92,81 +92,175 @@ You may also find the source code in `samples/cpp/tutorial_code/ml/non_linear_sv
@note The following code has been implemented with OpenCV 3.0 classes and functions. An equivalent version of the code
using OpenCV 2.4 can be found in [this page.](http://docs.opencv.org/2.4/doc/tutorials/ml/non_linear_svms/non_linear_svms.html#nonlinearsvms)
@include cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp
@add_toggle_cpp
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp)
- **Code at glance:**
@include samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp
@end_toggle
@add_toggle_java
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java)
- **Code at glance:**
@include samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java
@end_toggle
@add_toggle_python
- **Downloadable code**: Click
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py)
- **Code at glance:**
@include samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py
@end_toggle
Explanation
-----------
-# __Set up the training data__
- __Set up the training data__
The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
two different classes. To make the exercise more appealing, the training data is generated
randomly using uniform probability density functions (PDFs).
We have divided the generation of the training data into two main parts.
In the first part we generate data for both classes that is linearly separable.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup1
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup1
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup1
@end_toggle
In the second part we create data for both classes that is non-linearly separable, data that
overlaps.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup2
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup2
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup2
@end_toggle
- __Set up SVM's parameters__
@note In the previous tutorial @ref tutorial_introduction_to_svm there is an explanation of the
attributes of the class @ref cv::ml::SVM that we configure here before training the SVM.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp init
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java init
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py init
@end_toggle
There are just two differences between the configuration we do here and the one that was done in
the previous tutorial (@ref tutorial_introduction_to_svm) that we use as reference.
The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
two different classes. To make the exercise more appealing, the training data is generated
randomly using uniform probability density functions (PDFs).
- _C_. We chose a small value of this parameter here in order not to punish the
misclassification errors too much in the optimization. The idea of doing this stems from the wish to
obtain a solution close to the one intuitively expected. However, we recommend gaining
better insight into the problem by making adjustments to this parameter.
We have divided the generation of the training data into two main parts.
@note In this case there are just very few points in the overlapping region between classes.
By giving a smaller value to __FRAC_LINEAR_SEP__ the density of points can be increased and the
impact of the parameter _C_ explored more deeply.
In the first part we generate data for both classes that is linearly separable.
@snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup1
- _Termination Criteria of the algorithm_. The maximum number of iterations has to be
increased considerably in order to correctly solve a problem with non-linearly separable
training data. In particular, we have increased this value by five orders of magnitude.
In the second part we create data for both classes that is non-linearly separable, data that
overlaps.
@snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup2
- __Train the SVM__
-# __Set up SVM's parameters__
We call the method @ref cv::ml::SVM::train to build the SVM model. Be aware that the training
process may take quite a long time. Have patience when you run the program.
@note In the previous tutorial @ref tutorial_introduction_to_svm there is an explanation of the
attributes of the class @ref cv::ml::SVM that we configure here before training the SVM.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp train
@end_toggle
@snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp init
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java train
@end_toggle
There are just two differences between the configuration we do here and the one that was done in
the previous tutorial (@ref tutorial_introduction_to_svm) that we use as reference.
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py train
@end_toggle
- _C_. We chose a small value of this parameter here in order not to punish the
misclassification errors too much in the optimization. The idea of doing this stems from the wish to
obtain a solution close to the one intuitively expected. However, we recommend gaining
better insight into the problem by making adjustments to this parameter.
- __Show the Decision Regions__
@note In this case there are just very few points in the overlapping region between classes.
By giving a smaller value to __FRAC_LINEAR_SEP__ the density of points can be increased and the
impact of the parameter _C_ explored more deeply.
The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In
this example we have used this method in order to color the space depending on the prediction done
by the SVM. In other words, an image is traversed interpreting its pixels as points of the
Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in
dark green if it is the class with label 1 and in dark blue if it is the class with label 2.
- _Termination Criteria of the algorithm_. The maximum number of iterations has to be
increased considerably in order to correctly solve a problem with non-linearly separable
training data. In particular, we have increased this value by five orders of magnitude.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show
@end_toggle
-# __Train the SVM__
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show
@end_toggle
We call the method @ref cv::ml::SVM::train to build the SVM model. Be aware that the training
process may take quite a long time. Have patience when you run the program.
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show
@end_toggle
@snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp train
- __Show the training data__
-# __Show the Decision Regions__
The method @ref cv::circle is used to show the samples that compose the training data. The samples
of the class labeled with 1 are shown in light green and in light blue the samples of the class
labeled with 2.
The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In
this example we have used this method in order to color the space depending on the prediction done
by the SVM. In other words, an image is traversed interpreting its pixels as points of the
Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in
dark green if it is the class with label 1 and in dark blue if it is the class with label 2.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_data
@end_toggle
@snippet cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show
@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_data
@end_toggle
-# __Show the training data__
@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_data
@end_toggle
The method @ref cv::circle is used to show the samples that compose the training data. The samples
of the class labeled with 1 are shown in light green and in light blue the samples of the class
labeled with 2.
- __Support vectors__

We use here a couple of methods to obtain information about the support vectors. The method
@ref cv::ml::SVM::getSupportVectors obtains all support vectors. We have used this method here
to find the training examples that are support vectors and highlight them.

@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_vectors
@end_toggle

@add_toggle_java
@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_vectors
@end_toggle

@add_toggle_python
@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_vectors
@end_toggle
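A sketch of this step, reusing the objects from the previous sketches (highlight color and radius
are arbitrary choices):
@code{.cpp}
Mat sv = svm->getSupportVectors();  // one support vector per row, CV_32F
for (int i = 0; i < sv.rows; i++)
{
    const float* v = sv.ptr<float>(i);
    circle(I, Point((int)v[0], (int)v[1]), 6, Scalar(128, 128, 128), 2);  // gray ring
}
@endcode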
Results
-------

@ -6,6 +6,8 @@ of data.
- @subpage tutorial_introduction_to_svm
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Fernando Iglesias García
@ -14,6 +16,8 @@ of data.
- @subpage tutorial_non_linear_svms
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Fernando Iglesias García
@ -23,6 +27,8 @@ of data.
- @subpage tutorial_introduction_to_pca
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Theodore Tsesmelis

@ -17,6 +17,9 @@
]
}
},
"namespaces_dict": {
"cv.fisheye": "fisheye"
},
"func_arg_fix" : {
"findFundamentalMat" : { "points1" : {"ctype" : "vector_Point2f"},
"points2" : {"ctype" : "vector_Point2f"} },

@ -513,10 +513,6 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
{
cvtColor(img, img, COLOR_BGR2GRAY);
}
else
{
img.clone();
}
int prev_sqr_size = 0;
@ -578,6 +574,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
{
if (flags & CALIB_CB_NORMALIZE_IMAGE)
{
img = img.clone();
equalizeHist(img, img);
}

@ -2336,10 +2336,13 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2,
_uu[2] = 1;
cvCrossProduct(&uu, &t, &ww);
nt = cvNorm(&t, 0, CV_L2);
CV_Assert(fabs(nt) > 0);
nw = cvNorm(&ww, 0, CV_L2);
CV_Assert(fabs(nw) > 0);
cvConvertScale(&ww, &ww, 1 / nw);
cvCrossProduct(&t, &ww, &w3);
nw = cvNorm(&w3, 0, CV_L2);
CV_Assert(fabs(nw) > 0);
cvConvertScale(&w3, &w3, 1 / nw);
_uu[2] = 0;
@ -3159,6 +3162,10 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
Point3f* objPtData = objPtMat.ptr<Point3f>();
Point2f* imgPtData1 = imgPtMat1.ptr<Point2f>();
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
for( i = 0; i < nimages; i++, j += ni )
{
Mat objpt = objectPoints.getMat(i);
@ -3176,6 +3183,9 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
memcpy( imgPtData2 + j, imgpt2.ptr(), ni*sizeof(imgPtData2[0]) );
}
}
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
}
static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype)
@ -3870,12 +3880,14 @@ float cv::rectify3Collinear( InputArray _cameraMatrix1, InputArray _distCoeffs1,
int idx = fabs(t12(0,0)) > fabs(t12(1,0)) ? 0 : 1;
double c = t12(idx,0), nt = norm(t12, CV_L2);
CV_Assert(fabs(nt) > 0);
Mat_<double> uu = Mat_<double>::zeros(3,1);
uu(idx, 0) = c > 0 ? 1 : -1;
// calculate global Z rotation
Mat_<double> ww = t12.cross(uu), wR;
double nw = norm(ww, CV_L2);
CV_Assert(fabs(nw) > 0);
ww *= acos(fabs(c)/nt)/nw;
Rodrigues(ww, wR);

@ -206,6 +206,7 @@ void dls::run_kernel(const cv::Mat& pp)
void dls::build_coeff_matrix(const cv::Mat& pp, cv::Mat& Mtilde, cv::Mat& D)
{
CV_Assert(!pp.empty());
cv::Mat eye = cv::Mat::eye(3, 3, CV_64F);
// build coeff matrix

@ -126,7 +126,8 @@ void cv::fisheye::projectPoints(InputArray objectPoints, OutputArray imagePoints
{
Vec3d Xi = objectPoints.depth() == CV_32F ? (Vec3d)Xf[i] : Xd[i];
Vec3d Y = aff*Xi;
if (fabs(Y[2]) < DBL_MIN)
Y[2] = 1;
Vec2d x(Y[0]/Y[2], Y[1]/Y[2]);
double r2 = x.dot(x);
@ -1186,6 +1187,7 @@ void cv::internal::ComputeExtrinsicRefine(const Mat& imagePoints, const Mat& obj
{
CV_Assert(!objectPoints.empty() && objectPoints.type() == CV_64FC3);
CV_Assert(!imagePoints.empty() && imagePoints.type() == CV_64FC2);
CV_Assert(rvec.total() > 2 && tvec.total() > 2);
Vec6d extrinsics(rvec.at<double>(0), rvec.at<double>(1), rvec.at<double>(2),
tvec.at<double>(0), tvec.at<double>(1), tvec.at<double>(2));
double change = 1;
@ -1365,9 +1367,13 @@ void cv::internal::InitExtrinsics(const Mat& _imagePoints, const Mat& _objectPoi
double sc = .5 * (norm(H.col(0)) + norm(H.col(1)));
H = H / sc;
Mat u1 = H.col(0).clone();
u1 = u1 / norm(u1);
double norm_u1 = norm(u1);
CV_Assert(fabs(norm_u1) > 0);
u1 = u1 / norm_u1;
Mat u2 = H.col(1).clone() - u1.dot(H.col(1).clone()) * u1;
u2 = u2 / norm(u2);
double norm_u2 = norm(u2);
CV_Assert(fabs(norm_u2) > 0);
u2 = u2 / norm_u2;
Mat u3 = u1.cross(u2);
Mat RRR;
hconcat(u1, u2, RRR);

@ -194,6 +194,7 @@ void HomographyDecompZhang::decompose(std::vector<CameraMotion>& camMotions)
{
Mat W, U, Vt;
SVD::compute(getHnorm(), W, U, Vt);
CV_Assert(W.total() > 2 && Vt.total() > 7);
double lambda1=W.at<double>(0);
double lambda3=W.at<double>(2);
double lambda1m3 = (lambda1-lambda3);

@ -489,7 +489,14 @@ protected:
void run(int /* start_from */ )
{
CvMat zeros;
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
memset(&zeros, 0, sizeof(zeros));
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
C_Caller caller, bad_caller;
CvMat objectPoints_c, r_vec_c, t_vec_c, A_c, distCoeffs_c, imagePoints_c,

@ -1981,10 +1981,20 @@ CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, int maxComponents = 0);
/** wrap PCA::operator() and add eigenvalues output parameter */
CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
int maxComponents = 0);
/** wrap PCA::operator() */
CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, double retainedVariance);
/** wrap PCA::operator() and add eigenvalues output parameter */
CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
double retainedVariance);
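// Usage sketch for the new overloads (illustrative only, not part of the header):
//     Mat mean, eigenvectors, eigenvalues;
//     PCACompute(data, mean, eigenvectors, eigenvalues, /*maxComponents=*/5);
//     PCACompute(data, mean, eigenvectors, eigenvalues, /*retainedVariance=*/0.9);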
/** wrap PCA::project */
CV_EXPORTS_W void PCAProject(InputArray data, InputArray mean,
InputArray eigenvectors, OutputArray result);

@ -406,6 +406,24 @@ Cv64suf;
#endif
/****************************************************************************************\
* CV_NODISCARD attribute *
* encourages the compiler to issue a warning if the return value is discarded (C++17) *
\****************************************************************************************/
#ifndef CV_NODISCARD
# if defined(__GNUC__)
# define CV_NODISCARD __attribute__((__warn_unused_result__)) // at least available with GCC 3.4
# elif defined(__clang__) && defined(__has_attribute)
# if __has_attribute(__warn_unused_result__)
# define CV_NODISCARD __attribute__((__warn_unused_result__))
# endif
# endif
#endif
#ifndef CV_NODISCARD
# define CV_NODISCARD /* nothing by default */
#endif
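// Illustrative note (not part of the header): elsewhere in this patch cv::Mat::clone() is
// declared as "Mat clone() const CV_NODISCARD;", so on supporting compilers
//     src.clone();             // return value silently discarded -> may trigger a warning
// should instead be written as
//     Mat tmp = src.clone();   // return value actually used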
/****************************************************************************************\
* C++ 11 *
\****************************************************************************************/

@ -60,255 +60,72 @@
// access from within opencv code more accessible
namespace cv {
#ifndef CV_DOXYGEN
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
//! @addtogroup core_hal_intrin
//! @{
//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits
{
typedef _Tp int_type;
typedef _Tp uint_type;
typedef _Tp abs_type;
typedef _Tp sum_type;
enum { delta = 0, shift = 0 };
static int_type reinterpret_int(_Tp x) { return x; }
static uint_type reinterpet_uint(_Tp x) { return x; }
static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
};
template<> struct V_TypeTraits<uchar>
{
typedef uchar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef ushort w_type;
typedef unsigned q_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<schar>
{
typedef schar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef short w_type;
typedef int q_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<ushort>
{
typedef ushort value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef unsigned w_type;
typedef uchar nu_type;
enum { delta = 32768, shift = 16 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<short>
{
typedef short value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef int w_type;
typedef uchar nu_type;
typedef schar n_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<unsigned>
{
typedef unsigned value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef unsigned sum_type;
typedef uint64 w_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int>
{
typedef int value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef int sum_type;
typedef int64 w_type;
typedef short n_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<uint64>
{
typedef uint64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef uint64 sum_type;
typedef unsigned nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int64>
{
typedef int64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef int64 sum_type;
typedef int nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<float>
{
typedef float value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef float abs_type;
typedef float sum_type;
typedef double w_type;
static int_type reinterpret_int(value_type x)
{
Cv32suf u;
u.f = x;
return u.i;
}
static uint_type reinterpet_uint(value_type x)
{
Cv32suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv32suf u;
u.i = x;
return u.f;
}
};
template<> struct V_TypeTraits<double>
{
typedef double value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef double abs_type;
typedef double sum_type;
static int_type reinterpret_int(value_type x)
{
Cv64suf u;
u.f = x;
return u.i;
}
static uint_type reinterpet_uint(value_type x)
{
Cv64suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv64suf u;
u.i = x;
return u.f;
}
};
template <typename T> struct V_SIMD128Traits
{
enum { nlanes = 16 / sizeof(T) };
};
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
enum { nlanes128 = nlanes128_ }; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
//! @endcond
#ifndef CV_DOXYGEN
//! @}
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}
#ifdef CV_DOXYGEN
# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
#endif
#if CV_SSE2
@ -325,27 +142,25 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#else
#define CV_SIMD128_CPP 1
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
//! @addtogroup core_hal_intrin
//! @{
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
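// Illustrative example (not part of this header) of the three flavors of the same load,
// assuming the corresponding instruction sets are enabled at compile time:
//     v_float32x4 a = v_load(ptr);     // always a 128-bit register
//     v_float32x8 b = v256_load(ptr);  // explicitly 256-bit (AVX2 builds)
//     v_float32   c = vx_load(ptr);    // the widest register type available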
#if CV_AVX2
#ifndef CV_SIMD128
//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
#endif
#include "opencv2/core/hal/intrin_avx.hpp"
#ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif
//! @}
//==================================================================================================
//! @cond IGNORED
namespace cv {
@ -354,88 +169,175 @@ namespace cv {
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
template <typename R> struct V_RegTrait128;
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
template <> struct V_RegTrait128<uchar> {
typedef v_uint8x16 reg;
typedef v_uint16x8 w_reg;
typedef v_uint32x4 q_reg;
typedef v_uint8x16 u_reg;
static v_uint8x16 zero() { return v_setzero_u8(); }
static v_uint8x16 all(uchar val) { return v_setall_u8(val); }
};
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
template <> struct V_RegTrait128<schar> {
typedef v_int8x16 reg;
typedef v_int16x8 w_reg;
typedef v_int32x4 q_reg;
typedef v_uint8x16 u_reg;
static v_int8x16 zero() { return v_setzero_s8(); }
static v_int8x16 all(schar val) { return v_setall_s8(val); }
};
#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif
template <> struct V_RegTrait128<ushort> {
typedef v_uint16x8 reg;
typedef v_uint32x4 w_reg;
typedef v_int16x8 int_reg;
typedef v_uint16x8 u_reg;
static v_uint16x8 zero() { return v_setzero_u16(); }
static v_uint16x8 all(ushort val) { return v_setall_u16(val); }
};
#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif
template <> struct V_RegTrait128<short> {
typedef v_int16x8 reg;
typedef v_int32x4 w_reg;
typedef v_uint16x8 u_reg;
static v_int16x8 zero() { return v_setzero_s16(); }
static v_int16x8 all(short val) { return v_setall_s16(val); }
};
#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif
template <> struct V_RegTrait128<unsigned> {
typedef v_uint32x4 reg;
typedef v_uint64x2 w_reg;
typedef v_int32x4 int_reg;
typedef v_uint32x4 u_reg;
static v_uint32x4 zero() { return v_setzero_u32(); }
static v_uint32x4 all(unsigned val) { return v_setall_u32(val); }
};
#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif
template <> struct V_RegTrait128<int> {
typedef v_int32x4 reg;
typedef v_int64x2 w_reg;
typedef v_uint32x4 u_reg;
static v_int32x4 zero() { return v_setzero_s32(); }
static v_int32x4 all(int val) { return v_setall_s32(val); }
};
#if CV_SIMD512
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_WIDTH 64
#elif CV_SIMD256
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_WIDTH 32
#else
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
#endif
template <> struct V_RegTrait128<uint64> {
typedef v_uint64x2 reg;
static v_uint64x2 zero() { return v_setzero_u64(); }
static v_uint64x2 all(uint64 val) { return v_setall_u64(val); }
};
//==================================================================================================
template <> struct V_RegTrait128<int64> {
typedef v_int64x2 reg;
static v_int64x2 zero() { return v_setzero_s64(); }
static v_int64x2 all(int64 val) { return v_setall_s64(val); }
#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
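// Illustrative use of the width-agnostic API defined above (sketch only, assuming float data
// and CV_SIMD != 0; "src1", "src2", "dst" and "n" are placeholders):
//     for (int i = 0; i <= n - v_float32::nlanes; i += v_float32::nlanes)
//     {
//         v_float32 a = vx_load(src1 + i);
//         v_float32 b = vx_load(src2 + i);
//         vx_store(dst + i, v_muladd(a, b, vx_setall_f32(1.f)));
//     }
//     vx_cleanup();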
template<typename _Tp> struct V_RegTraits
{
};
template <> struct V_RegTrait128<float> {
typedef v_float32x4 reg;
typedef v_int32x4 int_reg;
typedef v_float32x4 u_reg;
static v_float32x4 zero() { return v_setzero_f32(); }
static v_float32x4 all(float val) { return v_setall_f32(val); }
};
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F
template <> struct V_RegTrait128<double> {
typedef v_float64x2 reg;
typedef v_int32x4 int_reg;
typedef v_float64x2 u_reg;
static v_float64x2 zero() { return v_setzero_f64(); }
static v_float64x2 all(double val) { return v_setall_f64(val); }
};
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#if CV_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8);
#endif
#endif
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#if CV_FP16
CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void);
#endif
#endif
#if CV_SIMD256
typedef v_uint8x32 v_uint8;
typedef v_int8x32 v_int8;
typedef v_uint16x16 v_uint16;
typedef v_int16x16 v_int16;
typedef v_uint32x8 v_uint32;
typedef v_int32x8 v_int32;
typedef v_uint64x4 v_uint64;
typedef v_int64x4 v_int64;
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
#endif
#if CV_FP16
typedef v_float16x16 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
inline void vx_cleanup() { v256_cleanup(); }
#elif CV_SIMD128
typedef v_uint8x16 v_uint8;
typedef v_int8x16 v_int8;
typedef v_uint16x8 v_uint16;
typedef v_int16x8 v_int16;
typedef v_uint32x4 v_uint32;
typedef v_int32x4 v_int32;
typedef v_uint64x2 v_uint64;
typedef v_int64x2 v_int64;
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
#endif
#if CV_FP16
typedef v_float16x8 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
#endif
inline void vx_cleanup() { v_cleanup(); }
#endif
inline unsigned int trailingZeros32(unsigned int value) {

File diff suppressed because it is too large

@ -247,8 +247,6 @@ template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
typedef _Tp lane_type;
typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
enum { nlanes = n };
// !@endcond
@ -797,11 +795,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types and signed 32bit int only. */
Returns \f$ a*b + c \f$
For floating point types and signed 32bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
{
v_reg<_Tp, n> d;
for( int i = 0; i < n; i++ )
@ -809,6 +807,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
return d;
}
/** @brief A synonym for v_fma */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
{
return v_fma(a, b, c);
}
/** @brief Dot product of elements
Multiply values in two registers and sum adjacent result pairs.
@ -1141,9 +1147,9 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
{
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load register contents from memory (aligned)
@ -1151,9 +1157,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
{
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load 64-bits of data to lower part (high part is undefined).
@ -1166,9 +1172,9 @@ v_int32x4 r = v_load_low(lo);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
{
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = ptr[i];
@ -1187,9 +1193,9 @@ v_int32x4 r = v_load_halves(lo, hi);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = loptr[i];
@ -1208,11 +1214,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
v_load_expand(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
@ -1229,11 +1235,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
v_load_expand_q(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::q_type q_type;
v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
@ -1622,6 +1628,17 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
return c;
}
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
v_reg<float, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = (float)a.s[i];
c.s[i+n] = (float)b.s[i];
}
return c;
}
/** @brief Convert to double
Supported input type is cv::v_int32x4. */
@ -1644,6 +1661,52 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return c;
}
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
v_reg<float, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
{
v_reg<double, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
v_reg<float, n>& x, v_reg<float, n>& y)
{
for( int i = 0; i < n; i++ )
{
int j = idx.s[i];
x.s[i] = tab[j];
y.s[i] = tab[j+1];
}
}
template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
v_reg<double, n>& x, v_reg<double, n>& y)
{
for( int i = 0; i < n; i++ )
{
int j = idx.s[i];
x.s[i] = tab[j];
y.s[i] = tab[j+1];
}
}
/** @brief Transpose 4x4 matrix
Scheme:
@ -1968,6 +2031,8 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
inline void v_cleanup() {}
//! @}
//! @name Check SIMD support

@ -280,11 +280,29 @@ struct v_float64x2
#if CV_FP16
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
vst1q_f16((__fp16*)ptr, a);
#endif
}
static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
@ -292,7 +310,7 @@ template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
return vld1_f16((const __fp16*)ptr);
#endif
}
template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
@ -301,24 +319,28 @@ template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
#endif
}
struct v_float16x4
struct v_float16x8
{
typedef short lane_type;
enum { nlanes = 4 };
enum { nlanes = 8 };
v_float16x4() {}
explicit v_float16x4(float16x4_t v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
v_float16x8() {}
explicit v_float16x8(float16x8_t v) : val(v) {}
v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
short v[] = {v0, v1, v2, v3};
val = cv_vld1_f16(v);
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = cv_vld1q_f16(v);
}
short get0() const
{
return vget_lane_s16(vreinterpret_s16_f16(val), 0);
return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
}
float16x4_t val;
float16x8_t val;
};
inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
@ -731,16 +753,32 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
// ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
// also adds FMA support both for single- and double-precision floating-point vectors
return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
return v_fma(a, b, c);
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_fma(a, b, c);
}
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
@ -753,9 +791,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
return v_fma(a, b, c);
}
#endif
@ -841,10 +884,15 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#if CV_FP16
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(cv_vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ cv_vst1_f16(ptr, a.val); }
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
@ -1293,6 +1341,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a)
return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
@ -1315,17 +1368,88 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
#endif
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
return v_float32x4(vcvt_f32_f16(a.val));
return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
return v_float16x4(vcvt_f16_f32(a.val));
return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
#endif
////////////// Lookup table access ////////////////////
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_int32x4(vld1q_s32(elems));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
float CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_float32x4(vld1q_f32(elems));
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
/*int CV_DECL_ALIGNED(32) idx[4];
v_store(idx, idxvec);
float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
x = v_float32x4(xxyy.val[0]);
y = v_float32x4(xxyy.val[1]);*/
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
double CV_DECL_ALIGNED(32) elems[2] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
};
return v_float64x2(vld1q_f64(elems));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float64x2(tab[idx[0]], tab[idx[1]]);
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation

@ -58,6 +58,17 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_float32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float64x2;
struct v_uint8x16
{
typedef uchar lane_type;
@ -144,6 +155,7 @@ struct v_int16x8
{
return (short)_mm_cvtsi128_si32(val);
}
__m128i val;
};
@ -163,6 +175,7 @@ struct v_uint32x4
{
return (unsigned)_mm_cvtsi128_si32(val);
}
__m128i val;
};
@ -182,6 +195,7 @@ struct v_int32x4
{
return _mm_cvtsi128_si32(val);
}
__m128i val;
};
@ -201,6 +215,7 @@ struct v_float32x4
{
return _mm_cvtss_f32(val);
}
__m128 val;
};
@ -222,6 +237,7 @@ struct v_uint64x2
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (unsigned)a | ((uint64)(unsigned)b << 32);
}
__m128i val;
};
@ -243,6 +259,7 @@ struct v_int64x2
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
}
__m128i val;
};
@ -262,29 +279,31 @@ struct v_float64x2
{
return _mm_cvtsd_f64(val);
}
__m128d val;
};
#if CV_FP16
struct v_float16x4
struct v_float16x8
{
typedef short lane_type;
typedef __m128i vector_type;
enum { nlanes = 4 };
enum { nlanes = 8 };
v_float16x4() : val(_mm_setzero_si128()) {}
explicit v_float16x4(__m128i v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
v_float16x8() : val(_mm_setzero_si128()) {}
explicit v_float16x8(__m128i v) : val(v) {}
v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
}
__m128i val;
};
#endif
inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
namespace hal_sse_internal
{
@ -697,11 +716,15 @@ inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
return v_int32x4(_mm_mullo_epi32(a.val, b.val));
#else
__m128i c0 = _mm_mul_epu32(a.val, b.val);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return v_int32x4(_mm_unpacklo_epi64(d0, d1));
#endif
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
@ -1027,11 +1050,35 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_fma(a, b, c);
}
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
@ -1040,17 +1087,16 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
return _Tpvec(_mm_sqrt_##suffix(res)); \
_Tpvec res = v_fma(a, a, b*b); \
return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
return _Tpvec(res); \
return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
return v_fma(a, b, c); \
}
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
@ -1268,12 +1314,15 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
#if CV_FP16
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ _mm_storel_epi64((__m128i*)ptr, a.val); }
#endif
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }
inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ _mm_store_si128((__m128i*)ptr, a.val); }
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
@ -2183,6 +2232,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a)
return v_float32x4(_mm_cvtpd_ps(a.val));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(_mm_cvtepi32_pd(a.val));
@ -2200,21 +2254,82 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
return v_float32x4(_mm_cvtph_ps(a.val));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
}
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
return v_float16x4(_mm_cvtps_ph(a.val, 0));
return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
}
#endif
////////////// Lookup table access ////////////////////
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
int idx[2];
v_store_low(idx, idxvec);
return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
// loads pairs from the table and deinterleaves them, e.g. returns:
// x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are float's indices, not the float-pair indices.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec are the offsets within the image.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
__m128 z = _mm_setzero_ps();
__m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
__m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
__m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
__m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
}
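// Sketch (not part of the source) of the bilinear-interpolation idea mentioned above, assuming
// "idxvec" holds the offsets of the left pixel of each horizontal pair, "row" points to the image
// row and "tx" holds the fractional x coordinates:
//     v_float32x4 x0, x1;
//     v_lut_deinterleave(row, idxvec, x0, x1);
//     v_float32x4 res = v_muladd(x1 - x0, tx, x0);   // x0 + tx*(x1 - x0)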
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
int idx[2];
v_store_low(idx, idxvec);
__m128d xy0 = _mm_loadu_pd(tab + idx[0]);
__m128d xy1 = _mm_loadu_pd(tab + idx[1]);
x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation

@ -764,6 +764,8 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
@ -836,6 +838,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
@ -848,6 +853,48 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
////////////// Lookup table access ////////////////////
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float64x2(tab[idx[0]], tab[idx[1]]);
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
inline void v_cleanup() {}
/** Reinterpret **/
/** its up there with load and store operations **/

@ -1165,7 +1165,7 @@ public:
The method creates a full copy of the array. The original step[] is not taken into account. So, the
array copy is a continuous array occupying total()*elemSize() bytes.
*/
Mat clone() const;
Mat clone() const CV_NODISCARD;
/** @brief Copies the matrix to another one.
@ -2252,7 +2252,7 @@ public:
Mat_ row(int y) const;
Mat_ col(int x) const;
Mat_ diag(int d=0) const;
Mat_ clone() const;
Mat_ clone() const CV_NODISCARD;
//! overridden forms of Mat::elemSize() etc.
size_t elemSize() const;
@ -2429,7 +2429,7 @@ public:
static UMat diag(const UMat& d);
//! returns deep copy of the matrix, i.e. the data is copied
UMat clone() const;
UMat clone() const CV_NODISCARD;
//! copies the matrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo( OutputArray m ) const;
@ -2722,7 +2722,7 @@ public:
SparseMat& operator = (const Mat& m);
//! creates full copy of the matrix
SparseMat clone() const;
SparseMat clone() const CV_NODISCARD;
//! copies all the data to the destination matrix. All the previous content of m is erased
void copyTo( SparseMat& m ) const;
@ -2959,7 +2959,7 @@ public:
SparseMat_& operator = (const Mat& m);
//! makes full copy of the matrix. All the elements are duplicated
SparseMat_ clone() const;
SparseMat_ clone() const CV_NODISCARD;
//! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type)
void create(int dims, const int* _sizes);
//! converts sparse matrix to the old-style CvSparseMat. All the elements are copied

@ -847,7 +847,9 @@ bool Mat::isSubmatrix() const
inline
size_t Mat::elemSize() const
{
return dims > 0 ? step.p[dims - 1] : 0;
size_t res = dims > 0 ? step.p[dims - 1] : 0;
CV_DbgAssert(res != 0);
return res;
}
inline
@ -3760,7 +3762,9 @@ bool UMat::isSubmatrix() const
inline
size_t UMat::elemSize() const
{
return dims > 0 ? step.p[dims - 1] : 0;
size_t res = dims > 0 ? step.p[dims - 1] : 0;
CV_DbgAssert(res != 0);
return res;
}
inline

@ -64,13 +64,14 @@ namespace cv
////////////////////////////// Small Matrix ///////////////////////////
//! @cond IGNORED
struct CV_EXPORTS Matx_AddOp {};
struct CV_EXPORTS Matx_SubOp {};
struct CV_EXPORTS Matx_ScaleOp {};
struct CV_EXPORTS Matx_MulOp {};
struct CV_EXPORTS Matx_DivOp {};
struct CV_EXPORTS Matx_MatMulOp {};
struct CV_EXPORTS Matx_TOp {};
// FIXIT Remove this (especially CV_EXPORTS modifier)
struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} };
struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} };
struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} };
struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} };
struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} };
struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} };
struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} };
//! @endcond
/** @brief Template class for small matrices whose type and size are known at compilation time
@ -116,7 +117,7 @@ public:
//! default constructor
Matx();
Matx(_Tp v0); //!< 1x1 matrix
explicit Matx(_Tp v0); //!< 1x1 matrix
Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix

@ -61,29 +61,44 @@ namespace cv
namespace internal
{
template<typename _Tp, int m> struct Matx_FastInvOp
template<typename _Tp, int m, int n> struct Matx_FastInvOp
{
bool operator()(const Matx<_Tp, m, n>& a, Matx<_Tp, n, m>& b, int method) const
{
return invert(a, b, method) != 0;
}
};
template<typename _Tp, int m> struct Matx_FastInvOp<_Tp, m, m>
{
bool operator()(const Matx<_Tp, m, m>& a, Matx<_Tp, m, m>& b, int method) const
{
Matx<_Tp, m, m> temp = a;
if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
{
Matx<_Tp, m, m> temp = a;
// assume that b is all 0's on input => make it a unity matrix
for( int i = 0; i < m; i++ )
b(i, i) = (_Tp)1;
// assume that b is all 0's on input => make it a unity matrix
for (int i = 0; i < m; i++)
b(i, i) = (_Tp)1;
if( method == DECOMP_CHOLESKY )
return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
if (method == DECOMP_CHOLESKY)
return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
}
else
{
return invert(a, b, method) != 0;
}
}
};
template<typename _Tp> struct Matx_FastInvOp<_Tp, 2>
template<typename _Tp> struct Matx_FastInvOp<_Tp, 2, 2>
{
bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int) const
bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int /*method*/) const
{
_Tp d = (_Tp)determinant(a);
if( d == 0 )
if (d == 0)
return false;
d = 1/d;
b(1,1) = a(0,0)*d;
@ -94,12 +109,12 @@ template<typename _Tp> struct Matx_FastInvOp<_Tp, 2>
}
};
template<typename _Tp> struct Matx_FastInvOp<_Tp, 3>
template<typename _Tp> struct Matx_FastInvOp<_Tp, 3, 3>
{
bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int) const
bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int /*method*/) const
{
_Tp d = (_Tp)determinant(a);
if( d == 0 )
if (d == 0)
return false;
d = 1/d;
b(0,0) = (a(1,1) * a(2,2) - a(1,2) * a(2,1)) * d;
@ -118,27 +133,43 @@ template<typename _Tp> struct Matx_FastInvOp<_Tp, 3>
};
template<typename _Tp, int m, int n> struct Matx_FastSolveOp
template<typename _Tp, int m, int l, int n> struct Matx_FastSolveOp
{
bool operator()(const Matx<_Tp, m, l>& a, const Matx<_Tp, m, n>& b,
Matx<_Tp, l, n>& x, int method) const
{
return cv::solve(a, b, x, method);
}
};
template<typename _Tp, int m, int n> struct Matx_FastSolveOp<_Tp, m, m, n>
{
bool operator()(const Matx<_Tp, m, m>& a, const Matx<_Tp, m, n>& b,
Matx<_Tp, m, n>& x, int method) const
{
Matx<_Tp, m, m> temp = a;
x = b;
if( method == DECOMP_CHOLESKY )
return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
{
Matx<_Tp, m, m> temp = a;
x = b;
if( method == DECOMP_CHOLESKY )
return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
}
else
{
return cv::solve(a, b, x, method);
}
}
};
template<typename _Tp> struct Matx_FastSolveOp<_Tp, 2, 1>
template<typename _Tp> struct Matx_FastSolveOp<_Tp, 2, 2, 1>
{
bool operator()(const Matx<_Tp, 2, 2>& a, const Matx<_Tp, 2, 1>& b,
Matx<_Tp, 2, 1>& x, int) const
{
_Tp d = (_Tp)determinant(a);
if( d == 0 )
if (d == 0)
return false;
d = 1/d;
x(0) = (b(0)*a(1,1) - b(1)*a(0,1))*d;
@ -147,13 +178,13 @@ template<typename _Tp> struct Matx_FastSolveOp<_Tp, 2, 1>
}
};
template<typename _Tp> struct Matx_FastSolveOp<_Tp, 3, 1>
template<typename _Tp> struct Matx_FastSolveOp<_Tp, 3, 3, 1>
{
bool operator()(const Matx<_Tp, 3, 3>& a, const Matx<_Tp, 3, 1>& b,
Matx<_Tp, 3, 1>& x, int) const
{
_Tp d = (_Tp)determinant(a);
if( d == 0 )
if (d == 0)
return false;
d = 1/d;
x(0) = d*(b(0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1)) -
@ -193,15 +224,8 @@ template<typename _Tp, int m, int n> inline
Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
{
Matx<_Tp, n, m> b;
bool ok;
if( m == n && (method == DECOMP_LU || method == DECOMP_CHOLESKY) )
ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*reinterpret_cast<const Matx<_Tp, m, m>*>(this), reinterpret_cast<Matx<_Tp, m, m>&>(b), method);
else
{
Mat A(*this, false), B(b, false);
ok = (invert(A, B, method) != 0);
}
if( NULL != p_is_ok ) { *p_is_ok = ok; }
bool ok = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, b, method);
if (p_is_ok) *p_is_ok = ok;
return ok ? b : Matx<_Tp, n, m>::zeros();
}
@ -209,15 +233,7 @@ template<typename _Tp, int m, int n> template<int l> inline
Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) const
{
Matx<_Tp, n, l> x;
bool ok;
if( method == DECOMP_LU || method == DECOMP_CHOLESKY )
ok = cv::internal::Matx_FastSolveOp<_Tp, m, l>()(*this, rhs, x, method);
else
{
Mat A(*this, false), B(rhs, false), X(x, false);
ok = cv::solve(A, B, X, method);
}
bool ok = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, x, method);
return ok ? x : Matx<_Tp, n, l>::zeros();
}
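
A minimal sketch, mirroring the Core_Solve regression test added below, of what the generalized Matx_FastInvOp/Matx_FastSolveOp enable: for non-square systems (or non-LU/Cholesky methods) Matx::inv() and Matx::solve() now route to cv::invert()/cv::solve() instead of being limited to m == n.

#include <iostream>
#include <opencv2/core.hpp>

int main()
{
    cv::Matx<float, 3, 2> A(2, 1,
                            3, 1,
                            6, 1);
    cv::Vec3f b(4, 5, 7);

    // Over-determined system: the generic Matx_FastSolveOp specialization forwards
    // to cv::solve(), which returns the least-squares solution for DECOMP_SVD/QR.
    cv::Matx<float, 2, 1> x = A.solve(b, cv::DECOMP_SVD);

    // Pseudo-inverse through the generic Matx_FastInvOp specialization (cv::invert).
    cv::Matx<float, 2, 3> pinvA = A.inv(cv::DECOMP_SVD);

    std::cout << cv::Mat(x) << std::endl << cv::Mat(pinvA) << std::endl;
    return 0;
}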

@ -61,7 +61,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone,
TEST_CYCLE()
{
source.clone();
Mat tmp = source.clone();
(void)tmp;
}
destination = source.clone();
@ -88,7 +89,8 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
TEST_CYCLE()
{
roi.clone();
Mat tmp = roi.clone();
(void)tmp;
}
destination = roi.clone();

@ -1180,7 +1180,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
op == CMP_NE || op == CMP_GE || op == CMP_GT );
if(_src1.empty() && _src2.empty())
if(_src1.empty() || _src2.empty())
{
_dst.release();
return;

@ -2916,12 +2916,29 @@ cvInitImageHeader( IplImage * image, CvSize size, int depth,
if( !image )
CV_Error( CV_HeaderIsNull, "null pointer to header" );
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
memset( image, 0, sizeof( *image ));
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
image->nSize = sizeof( *image );
icvGetColorModel( channels, &colorModel, &channelSeq );
strncpy( image->colorModel, colorModel, 4 );
strncpy( image->channelSeq, channelSeq, 4 );
for (int i = 0; i < 4; i++)
{
image->colorModel[i] = colorModel[i];
if (colorModel[i] == 0)
break;
}
for (int i = 0; i < 4; i++)
{
image->channelSeq[i] = channelSeq[i];
if (channelSeq[i] == 0)
break;
}
if( size.width < 0 || size.height < 0 )
CV_Error( CV_BadROISize, "Bad input roi" );

@ -263,6 +263,7 @@ void cv::batchDistance( InputArray _src1, InputArray _src2,
if( crosscheck )
{
CV_Assert( K == 1 && update == 0 && mask.empty() );
CV_Assert(!nidx.empty());
Mat tdist, tidx;
batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);

@ -44,7 +44,7 @@ static const char* getTestOpMath(unsigned testOp)
const char* depthToString_(int depth)
{
static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
return depth <= CV_USRTYPE1 ? depthNames[depth] : NULL;
return (depth <= CV_USRTYPE1 && depth >= 0) ? depthNames[depth] : NULL;
}
const cv::String typeToString_(int type)

@ -81,10 +81,9 @@ void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
{
float32x4_t v_src = vld1q_f32(src + x);
float16x4_t v_dst = vcvt_f16_f32(v_src);
cv_vst1_f16((__fp16*)dst + x, v_dst);
cv_vst1_f16(dst + x, v_dst);
}
for ( ; x < size.width; x++ )

@ -411,6 +411,8 @@ Mat& Mat::operator = (const Scalar& s)
{
CV_INSTRUMENT_REGION()
if (empty()) return *this;
const Mat* arrays[] = { this };
uchar* dptr;
NAryMatIterator it(arrays, &dptr, 1);

@ -1100,6 +1100,9 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
CV_Assert( type == _src2.type() && (type == CV_32F || type == CV_64F) );
method &= ~DECOMP_NORMAL;
CV_Check(method, method == DECOMP_LU || method == DECOMP_SVD || method == DECOMP_EIG ||
method == DECOMP_CHOLESKY || method == DECOMP_QR,
"Unsupported method, see #DecompTypes");
CV_Assert( (method != DECOMP_LU && method != DECOMP_CHOLESKY) ||
is_normal || src.rows == src.cols );

File diff suppressed because it is too large

@ -360,6 +360,19 @@ void cv::PCACompute(InputArray data, InputOutputArray mean,
pca.eigenvectors.copyTo(eigenvectors);
}
void cv::PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
int maxComponents)
{
CV_INSTRUMENT_REGION()
PCA pca;
pca(data, mean, 0, maxComponents);
pca.mean.copyTo(mean);
pca.eigenvectors.copyTo(eigenvectors);
pca.eigenvalues.copyTo(eigenvalues);
}
void cv::PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, double retainedVariance)
{
@ -371,6 +384,19 @@ void cv::PCACompute(InputArray data, InputOutputArray mean,
pca.eigenvectors.copyTo(eigenvectors);
}
void cv::PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
double retainedVariance)
{
CV_INSTRUMENT_REGION()
PCA pca;
pca(data, mean, 0, retainedVariance);
pca.mean.copyTo(mean);
pca.eigenvectors.copyTo(eigenvectors);
pca.eigenvalues.copyTo(eigenvalues);
}
void cv::PCAProject(InputArray data, InputArray mean,
InputArray eigenvectors, OutputArray result)
{
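
A short sketch, on synthetic data, of the new PCACompute overloads that additionally return the eigenvalues; the retainedVariance form follows the same pattern.

#include <iostream>
#include <opencv2/core.hpp>

int main()
{
    cv::Mat data(100, 10, CV_32F);                    // 100 samples, 10 features
    cv::randu(data, cv::Scalar(0), cv::Scalar(1));

    cv::Mat mean, eigenvectors, eigenvalues;
    cv::PCACompute(data, mean, eigenvectors, eigenvalues, /*maxComponents=*/5);

    // One eigenvalue per retained component, sorted in descending order.
    std::cout << "components: " << eigenvectors.rows
              << ", eigenvalues: " << eigenvalues.reshape(1, 1) << std::endl;
    return 0;
}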

@ -511,6 +511,8 @@ static RandnScaleFunc randnScaleTab[] =
void RNG::fill( InputOutputArray _mat, int disttype,
InputArray _param1arg, InputArray _param2arg, bool saturateRange )
{
if (_mat.empty())
return;
Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat();
int depth = mat.depth(), cn = mat.channels();
AutoBuffer<double> _parambuf;

@ -241,9 +241,9 @@ TEST(hal_intrin, float64x2) {
}
#endif
TEST(hal_intrin,float16x4)
TEST(hal_intrin,float16)
{
CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ());
CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
throw SkipTestException("Unsupported hardware: FP16 is not available");
}

@ -7,9 +7,9 @@
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void test_hal_intrin_float16x4()
void test_hal_intrin_float16()
{
TheTest<v_float16x4>()
TheTest<v_float16x8>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;

@ -6,7 +6,7 @@
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void test_hal_intrin_float16x4();
void test_hal_intrin_float16();
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -50,6 +50,8 @@ template <> struct initializer<2>
template <typename R> struct Data
{
typedef typename R::lane_type LaneType;
typedef typename V_TypeTraits<LaneType>::int_type int_type;
Data()
{
for (int i = 0; i < R::nlanes; ++i)
@ -104,6 +106,17 @@ template <typename R> struct Data
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
int_type as_int(int i) const
{
CV_Assert(i >= 0 && i < R::nlanes);
union
{
LaneType l;
int_type i;
} v;
v.l = d[i];
return v.i;
}
const LaneType * mid() const
{
return d + R::nlanes / 2;
@ -247,8 +260,9 @@ template<typename R> struct TheTest
EXPECT_EQ(d, res);
// zero, all
Data<R> resZ = V_RegTrait128<LaneType>::zero();
Data<R> resV = V_RegTrait128<LaneType>::all(8);
Data<R> resZ, resV;
resZ.fill((LaneType)0);
resV.fill((LaneType)8);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)0, resZ[i]);
@ -339,7 +353,7 @@ template<typename R> struct TheTest
// v_expand and v_load_expand
TheTest & test_expand()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename V_RegTraits<R>::w_reg Rx2;
Data<R> dataA;
R a = dataA;
@ -362,7 +376,7 @@ template<typename R> struct TheTest
TheTest & test_expand_q()
{
typedef typename V_RegTrait128<LaneType>::q_reg Rx4;
typedef typename V_RegTraits<R>::q_reg Rx4;
Data<R> data;
Data<Rx4> out = v_load_expand_q(data.d);
const int n = Rx4::nlanes;
@ -436,7 +450,7 @@ template<typename R> struct TheTest
TheTest & test_mul_expand()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename V_RegTraits<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Rx2 c, d;
@ -456,7 +470,7 @@ template<typename R> struct TheTest
TheTest & test_abs()
{
typedef typename V_RegTrait128<LaneType>::u_reg Ru;
typedef typename V_RegTraits<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA, dataB(10);
R a = dataA, b = dataB;
@ -520,7 +534,7 @@ template<typename R> struct TheTest
TheTest & test_dot_prod()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename V_RegTraits<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2);
@ -608,7 +622,7 @@ template<typename R> struct TheTest
TheTest & test_absdiff()
{
typedef typename V_RegTrait128<LaneType>::u_reg Ru;
typedef typename V_RegTraits<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
@ -657,12 +671,21 @@ template<typename R> struct TheTest
TheTest & test_mask()
{
typedef V_TypeTraits<LaneType> Traits;
typedef typename Traits::int_type int_type;
typedef typename V_RegTraits<R>::int_reg int_reg;
typedef typename V_RegTraits<int_reg>::u_reg uint_reg;
typedef typename int_reg::lane_type int_type;
typedef typename uint_reg::lane_type uint_type;
Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
dataA[1] *= (LaneType)-1;
const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0));
union
{
LaneType l;
uint_type ui;
}
all1s;
all1s.ui = (uint_type)-1;
LaneType mask_one = all1s.l;
dataB[1] = mask_one;
dataB[R::nlanes / 2] = mask_one;
dataB[R::nlanes - 1] = mask_one;
@ -684,10 +707,8 @@ template<typename R> struct TheTest
Data<R> resF = f;
for (int i = 0; i < R::nlanes; ++i)
{
int_type m2 = Traits::reinterpret_int(dataB[i]);
EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
| (Traits::reinterpret_int(dataE[i]) & ~m2),
Traits::reinterpret_int(resF[i]));
int_type m2 = dataB.as_int(i);
EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i));
}
return *this;
@ -697,7 +718,7 @@ template<typename R> struct TheTest
TheTest & test_pack()
{
SCOPED_TRACE(s);
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename V_RegTraits<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<Rx2> dataA, dataB;
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
@ -734,8 +755,9 @@ template<typename R> struct TheTest
TheTest & test_pack_u()
{
SCOPED_TRACE(s);
typedef typename V_TypeTraits<LaneType>::w_type LaneType_w;
typedef typename V_RegTrait128<LaneType_w>::int_reg Ri2;
//typedef typename V_RegTraits<LaneType>::w_type LaneType_w;
typedef typename V_RegTraits<R>::w_reg R2;
typedef typename V_RegTraits<R2>::int_reg Ri2;
typedef typename Ri2::lane_type w_type;
Data<Ri2> dataA, dataB;
@ -864,7 +886,7 @@ template<typename R> struct TheTest
TheTest & test_float_math()
{
typedef typename V_RegTrait128<LaneType>::int_reg Ri;
typedef typename V_RegTraits<R>::round_reg Ri;
Data<R> data1, data2, data3;
data1 *= 1.1;
data2 += 10;
@ -1005,31 +1027,28 @@ template<typename R> struct TheTest
TheTest & test_loadstore_fp16()
{
#if CV_FP16 && CV_SIMD128
#if CV_FP16 && CV_SIMD
AlignedData<R> data;
AlignedData<R> out;
if(1 /* checkHardwareSupport(CV_CPU_FP16) */ )
{
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.u;
R r2 = v_load_f16(data.a.d);
R r3(r2);
EXPECT_EQ(data.u[0], r1.get0());
EXPECT_EQ(data.a[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
// check some store methods
out.a.clear();
v_store_f16(out.a.d, r1);
EXPECT_EQ(data.a, out.a);
}
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.u;
R r2 = v_load_f16(data.a.d);
R r3(r2);
EXPECT_EQ(data.u[0], r1.get0());
EXPECT_EQ(data.a[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
// check some store methods
out.a.clear();
v_store(out.a.d, r1);
EXPECT_EQ(data.a, out.a);
return *this;
#endif
@ -1037,18 +1056,15 @@ template<typename R> struct TheTest
TheTest & test_float_cvt_fp16()
{
#if CV_FP16 && CV_SIMD128
AlignedData<v_float32x4> data;
if(1 /* checkHardwareSupport(CV_CPU_FP16) */)
{
// check conversion
v_float32x4 r1 = v_load(data.a.d);
v_float16x4 r2 = v_cvt_f16(r1);
v_float32x4 r3 = v_cvt_f32(r2);
EXPECT_EQ(0x3c00, r2.get0());
EXPECT_EQ(r3.get0(), r1.get0());
}
#if CV_FP16 && CV_SIMD
AlignedData<v_float32> data;
// check conversion
v_float32 r1 = vx_load(data.a.d);
v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
v_float32 r3 = v_cvt_f32(r2);
EXPECT_EQ(0x3c00, r2.get0());
EXPECT_EQ(r3.get0(), r1.get0());
return *this;
#endif
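
A standalone sketch of the union-based reinterpretation that Data::as_int and the rewritten test_mask above rely on: a lane value is copied into a union so its raw bit pattern can be compared as an integer (float/int32_t stand in for LaneType/int_type).

#include <cassert>
#include <cstdint>

static std::int32_t as_int(float x)
{
    union { float f; std::int32_t i; } v;
    v.f = x;
    return v.i;                            // raw bit pattern of x
}

int main()
{
    assert(as_int(0.0f) == 0);
    assert(as_int(1.0f) == 0x3f800000);    // IEEE-754 encoding of 1.0f
    return 0;
}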

@ -134,7 +134,9 @@ double Core_PowTest::get_success_error_level( int test_case_idx, int i, int j )
if( depth < CV_32F )
return power == cvRound(power) && power >= 0 ? 0 : 1;
else
return Base::get_success_error_level( test_case_idx, i, j );
{
return depth != CV_64F ? Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1;
}
}
@ -3129,6 +3131,75 @@ TEST(Core_QR_Solver, accuracy64f)
ASSERT_FALSE(solve(A, B, solutionQR, DECOMP_QR));
}
TEST(Core_Solve, regression_11888)
{
cv::Matx<float, 3, 2> A(
2, 1,
3, 1,
6, 1
);
cv::Vec<float, 3> b(4, 5, 7);
cv::Matx<float, 2, 1> xQR = A.solve(b, DECOMP_QR);
cv::Matx<float, 2, 1> xSVD = A.solve(b, DECOMP_SVD);
EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 0.001);
cv::Matx<float, 2, 3> iA = A.inv(DECOMP_SVD);
EXPECT_LE(cvtest::norm(iA*A, Matx<float, 2, 2>::eye(), NORM_L2), 1e-3);
EXPECT_ANY_THROW({
/*cv::Matx<float, 2, 1> xLU =*/ A.solve(b, DECOMP_LU);
std::cout << "FATAL ERROR" << std::endl;
});
}
TEST(Core_Solve, Matx_2_2)
{
cv::Matx<float, 2, 2> A(
2, 1,
1, 1
);
cv::Vec<float, 2> b(4, 5);
cv::Matx<float, 2, 1> xLU = A.solve(b, DECOMP_LU);
cv::Matx<float, 2, 1> xQR = A.solve(b, DECOMP_QR);
cv::Matx<float, 2, 1> xSVD = A.solve(b, DECOMP_SVD);
EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
cv::Matx<float, 2, 2> iA = A.inv(DECOMP_SVD);
EXPECT_LE(cvtest::norm(iA*A, Matx<float, 2, 2>::eye(), NORM_L2), 1e-3);
}
TEST(Core_Solve, Matx_3_3)
{
cv::Matx<float, 3, 3> A(
2, 1, 0,
0, 1, 1,
1, 0, 1
);
cv::Vec<float, 3> b(4, 5, 6);
cv::Matx<float, 3, 1> xLU = A.solve(b, DECOMP_LU);
cv::Matx<float, 3, 1> xQR = A.solve(b, DECOMP_QR);
cv::Matx<float, 3, 1> xSVD = A.solve(b, DECOMP_SVD);
EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
cv::Matx<float, 3, 3> iA = A.inv(DECOMP_SVD);
EXPECT_LE(cvtest::norm(iA*A, Matx<float, 3, 3>::eye(), NORM_L2), 1e-3);
}
TEST(Core_Solve, Matx_4_4)
{
cv::Matx<float, 4, 4> A(
2, 1, 0, 4,
0, 1, 1, 3,
1, 0, 1, 2,
2, 2, 0, 1
);
cv::Vec<float, 4> b(4, 5, 6, 7);
cv::Matx<float, 4, 1> xLU = A.solve(b, DECOMP_LU);
cv::Matx<float, 4, 1> xQR = A.solve(b, DECOMP_QR);
cv::Matx<float, 4, 1> xSVD = A.solve(b, DECOMP_SVD);
EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
cv::Matx<float, 4, 4> iA = A.inv(DECOMP_SVD);
EXPECT_LE(cvtest::norm(iA*A, Matx<float, 4, 4>::eye(), NORM_L2), 1e-3);
}
softdouble naiveExp(softdouble x)
{
int exponent = x.getExp();

@ -794,13 +794,13 @@ bool CV_OperationsTest::TestTemplateMat()
Size size(2, 5);
TestType<float>(size, 1.f);
cv::Vec3f val1 = 1.f;
cv::Vec3f val1(1.f);
TestType<cv::Vec3f>(size, val1);
cv::Matx31f val2 = 1.f;
cv::Matx31f val2(1.f);
TestType<cv::Matx31f>(size, val2);
cv::Matx41f val3 = 1.f;
cv::Matx41f val3(1.f);
TestType<cv::Matx41f>(size, val3);
cv::Matx32f val4 = 1.f;
cv::Matx32f val4(1.f);
TestType<cv::Matx32f>(size, val4);
}
catch (const test_excep& e)

@ -168,11 +168,12 @@ void Core_RandTest::run( int )
{
tested_rng = saved_rng;
int sz = 0, dsz = 0, slice;
for( slice = 0; slice < maxSlice; slice++, sz += dsz )
for( slice = 0; slice < maxSlice && sz < SZ; slice++, sz += dsz )
{
dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz + 1)) : SZ - sz;
dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz;
Mat aslice = arr[k].colRange(sz, sz + dsz);
tested_rng.fill(aslice, dist_type, A, B);
printf("%d - %d\n", sz, sz + dsz);
}
}

@ -644,6 +644,24 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String());
/** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
* @param bufferCfg A buffer containing the content of a .cfg file with the text description of the network architecture.
* @param bufferModel A buffer containing the content of a .weights file with the learned network.
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromDarknet(const std::vector<uchar>& bufferCfg,
const std::vector<uchar>& bufferModel = std::vector<uchar>());
/** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
* @param bufferCfg A buffer containing the content of a .cfg file with the text description of the network architecture.
* @param lenCfg Number of bytes to read from bufferCfg
* @param bufferModel A buffer containing the content of a .weights file with the learned network.
* @param lenModel Number of bytes to read from bufferModel
* @returns Net object.
*/
CV_EXPORTS Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg,
const char *bufferModel = NULL, size_t lenModel = 0);
/** @brief Reads a network model stored in <a href="http://caffe.berkeleyvision.org">Caffe</a> framework's format.
* @param prototxt path to the .prototxt file with text description of the network architecture.
* @param caffeModel path to the .caffemodel file with learned network.
@ -651,6 +669,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String());
/** @brief Reads a network model stored in Caffe framework's format from an in-memory buffer.
* @param bufferProto buffer containing the content of the .prototxt file
* @param bufferModel buffer containing the content of the .caffemodel file
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromCaffe(const std::vector<uchar>& bufferProto,
const std::vector<uchar>& bufferModel = std::vector<uchar>());
/** @brief Reads a network model stored in Caffe framework's format from an in-memory buffer.
* @details This is an overloaded member function, provided for convenience.
* It differs from the above function only in what argument(s) it accepts.
@ -672,6 +698,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String());
/** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
* @param bufferModel buffer containing the content of the pb file
* @param bufferConfig buffer containing the content of the pbtxt file
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromTensorflow(const std::vector<uchar>& bufferModel,
const std::vector<uchar>& bufferConfig = std::vector<uchar>());
/** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
* @details This is an overloaded member function, provided for convenience.
* It differs from the above function only in what argument(s) it accepts.
@ -735,6 +769,18 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = "");
/**
* @brief Read deep learning network represented in one of the supported formats.
* @details This is an overloaded member function, provided for convenience.
* It differs from the above function only in what argument(s) it accepts.
* @param[in] framework Name of origin framework.
* @param[in] bufferModel A buffer with the content of the binary file with weights.
* @param[in] bufferConfig A buffer with the content of the text file that contains the network configuration.
* @returns Net object.
*/
CV_EXPORTS_W Net readNet(const String& framework, const std::vector<uchar>& bufferModel,
const std::vector<uchar>& bufferConfig = std::vector<uchar>());
/** @brief Loads blob which was serialized as torch.Tensor object of Torch7 framework.
* @warning This function has the same limitations as readNetFromTorch().
*/
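
A minimal sketch of how the new buffer-based readers can be driven from application code; readFile and the model/config file names are placeholders, not part of the API.

#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <opencv2/dnn.hpp>

// hypothetical helper: slurp a whole file into memory
static std::vector<unsigned char> readFile(const std::string& path)
{
    std::ifstream f(path, std::ios::binary);
    return std::vector<unsigned char>(std::istreambuf_iterator<char>(f),
                                      std::istreambuf_iterator<char>());
}

int main()
{
    std::vector<unsigned char> cfg     = readFile("yolo-voc.cfg");      // placeholder paths
    std::vector<unsigned char> weights = readFile("yolo-voc.weights");

    // Either through the framework-specific overload...
    cv::dnn::Net net = cv::dnn::readNetFromDarknet(cfg, weights);

    // ...or through the generic entry point (model buffer first, config buffer second).
    cv::dnn::Net net2 = cv::dnn::readNet("darknet", weights, cfg);

    return net.empty() || net2.empty();
}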

@ -1,10 +1,14 @@
package org.opencv.test.dnn;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfFloat;
import org.opencv.core.MatOfByte;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.DictValue;
@ -26,6 +30,15 @@ public class DnnTensorFlowTest extends OpenCVTestCase {
Net net;
private static void normAssert(Mat ref, Mat test) {
final double l1 = 1e-5;
final double lInf = 1e-4;
double normL1 = Core.norm(ref, test, Core.NORM_L1) / ref.total();
double normLInf = Core.norm(ref, test, Core.NORM_INF) / ref.total();
assertTrue(normL1 < l1);
assertTrue(normLInf < lInf);
}
@Override
protected void setUp() throws Exception {
super.setUp();
@ -46,7 +59,7 @@ public class DnnTensorFlowTest extends OpenCVTestCase {
File testDataPath = new File(envTestDataPath);
File f = new File(testDataPath, "dnn/space_shuttle.jpg");
File f = new File(testDataPath, "dnn/grace_hopper_227.png");
sourceImageFile = f.toString();
if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
@ -77,31 +90,55 @@ public class DnnTensorFlowTest extends OpenCVTestCase {
}
public void testTestNetForward() {
Mat rawImage = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", rawImage);
public void checkInceptionNet(Net net)
{
Mat image = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", image);
Mat image = new Mat();
Imgproc.resize(rawImage, image, new Size(224,224));
Mat inputBlob = Dnn.blobFromImage(image);
Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
assertNotNull("Converting image to blob failed!", inputBlob);
Mat inputBlobP = new Mat();
Core.subtract(inputBlob, new Scalar(117.0), inputBlobP);
net.setInput(inputBlobP, "input" );
Mat result = net.forward();
net.setInput(inputBlob, "input");
Mat result = new Mat();
try {
net.setPreferableBackend(Dnn.DNN_BACKEND_OPENCV);
result = net.forward("softmax2");
}
catch (Exception e) {
fail("DNN forward failed: " + e.getMessage());
}
assertNotNull("Net returned no result!", result);
Core.MinMaxLocResult minmax = Core.minMaxLoc(result.reshape(1, 1));
result = result.reshape(1, 1);
Core.MinMaxLocResult minmax = Core.minMaxLoc(result);
assertEquals("Wrong prediction", (int)minmax.maxLoc.x, 866);
Mat top5RefScores = new MatOfFloat(new float[] {
0.63032645f, 0.2561979f, 0.032181446f, 0.015721032f, 0.014785315f
}).reshape(1, 1);
assertTrue("No image recognized!", minmax.maxVal > 0.9);
Core.sort(result, result, Core.SORT_DESCENDING);
normAssert(result.colRange(0, 5), top5RefScores);
}
public void testTestNetForward() {
checkInceptionNet(net);
}
public void testReadFromBuffer() {
File modelFile = new File(modelFileName);
byte[] modelBuffer = new byte[ (int)modelFile.length() ];
try {
FileInputStream fis = new FileInputStream(modelFile);
fis.read(modelBuffer);
fis.close();
} catch (IOException e) {
fail("Failed to read a model: " + e.getMessage());
}
net = Dnn.readNetFromTensorflow(new MatOfByte(modelBuffer));
checkInceptionNet(net);
}
}

@ -453,6 +453,15 @@ Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
return net;
}
Net readNetFromCaffe(const std::vector<uchar>& bufferProto, const std::vector<uchar>& bufferModel)
{
const char* bufferProtoPtr = reinterpret_cast<const char*>(&bufferProto[0]);
const char* bufferModelPtr = bufferModel.empty() ? NULL :
reinterpret_cast<const char*>(&bufferModel[0]);
return readNetFromCaffe(bufferProtoPtr, bufferProto.size(),
bufferModelPtr, bufferModel.size());
}
#endif //HAVE_PROTOBUF
CV__DNN_EXPERIMENTAL_NS_END

@ -44,6 +44,7 @@
#include "../precomp.hpp"
#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <map>
@ -66,14 +67,19 @@ public:
DarknetImporter() {}
DarknetImporter(const char *cfgFile, const char *darknetModel)
DarknetImporter(std::istream &cfgStream, std::istream &darknetModelStream)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromCfgFileOrDie(cfgFile, &net);
ReadNetParamsFromCfgStreamOrDie(cfgStream, &net);
ReadNetParamsFromBinaryStreamOrDie(darknetModelStream, &net);
}
if (darknetModel && darknetModel[0])
ReadNetParamsFromBinaryFileOrDie(darknetModel, &net);
DarknetImporter(std::istream &cfgStream)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromCfgStreamOrDie(cfgStream, &net);
}
struct BlobNote
@ -175,15 +181,75 @@ public:
}
};
static Net readNetFromDarknet(std::istream &cfgFile, std::istream &darknetModel)
{
Net net;
DarknetImporter darknetImporter(cfgFile, darknetModel);
darknetImporter.populateNet(net);
return net;
}
Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= String()*/)
static Net readNetFromDarknet(std::istream &cfgFile)
{
DarknetImporter darknetImporter(cfgFile.c_str(), darknetModel.c_str());
Net net;
DarknetImporter darknetImporter(cfgFile);
darknetImporter.populateNet(net);
return net;
}
}
Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= String()*/)
{
std::ifstream cfgStream(cfgFile.c_str());
if (!cfgStream.is_open())
{
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(cfgFile));
}
if (darknetModel != String())
{
std::ifstream darknetModelStream(darknetModel.c_str(), std::ios::binary);
if (!darknetModelStream.is_open())
{
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(darknetModel));
}
return readNetFromDarknet(cfgStream, darknetModelStream);
}
else
return readNetFromDarknet(cfgStream);
}
struct BufferStream : public std::streambuf
{
BufferStream(const char* s, std::size_t n)
{
char* ptr = const_cast<char*>(s);
setg(ptr, ptr, ptr + n);
}
};
Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg, const char *bufferModel, size_t lenModel)
{
BufferStream cfgBufferStream(bufferCfg, lenCfg);
std::istream cfgStream(&cfgBufferStream);
if (lenModel)
{
BufferStream weightsBufferStream(bufferModel, lenModel);
std::istream weightsStream(&weightsBufferStream);
return readNetFromDarknet(cfgStream, weightsStream);
}
else
return readNetFromDarknet(cfgStream);
}
Net readNetFromDarknet(const std::vector<uchar>& bufferCfg, const std::vector<uchar>& bufferModel)
{
const char* bufferCfgPtr = reinterpret_cast<const char*>(&bufferCfg[0]);
const char* bufferModelPtr = bufferModel.empty() ? NULL :
reinterpret_cast<const char*>(&bufferModel[0]);
return readNetFromDarknet(bufferCfgPtr, bufferCfg.size(),
bufferModelPtr, bufferModel.size());
}
CV__DNN_EXPERIMENTAL_NS_END
}} // namespace
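
BufferStream above wraps a raw memory range in a std::streambuf so the existing std::istream-based cfg/weights parsers can read straight from a buffer; a self-contained sketch of the same idea:

#include <cstddef>
#include <iostream>
#include <istream>
#include <streambuf>
#include <string>

// read-only streambuf over an existing memory range (same idea as BufferStream)
struct MemoryBuf : public std::streambuf
{
    MemoryBuf(const char* s, std::size_t n)
    {
        char* p = const_cast<char*>(s);    // only the get area is set up; nothing is written
        setg(p, p, p + n);
    }
};

int main()
{
    const char cfg[] = "[net]\nwidth=416\nheight=416\n";
    MemoryBuf buf(cfg, sizeof(cfg) - 1);
    std::istream is(&buf);

    for (std::string line; std::getline(is, line); )
        std::cout << line << std::endl;
    return 0;
}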

@ -476,68 +476,61 @@ namespace cv {
return dst;
}
bool ReadDarknetFromCfgFile(const char *cfgFile, NetParameter *net)
bool ReadDarknetFromCfgStream(std::istream &ifile, NetParameter *net)
{
std::ifstream ifile;
ifile.open(cfgFile);
if (ifile.is_open())
{
bool read_net = false;
int layers_counter = -1;
for (std::string line; std::getline(ifile, line);) {
line = escapeString(line);
if (line.empty()) continue;
switch (line[0]) {
case '\0': break;
case '#': break;
case ';': break;
case '[':
if (line == "[net]") {
read_net = true;
}
else {
// read section
read_net = false;
++layers_counter;
const size_t layer_type_size = line.find("]") - 1;
CV_Assert(layer_type_size < line.size());
std::string layer_type = line.substr(1, layer_type_size);
net->layers_cfg[layers_counter]["type"] = layer_type;
}
break;
default:
// read entry
const size_t separator_index = line.find('=');
CV_Assert(separator_index < line.size());
if (separator_index != std::string::npos) {
std::string name = line.substr(0, separator_index);
std::string value = line.substr(separator_index + 1, line.size() - (separator_index + 1));
name = escapeString(name);
value = escapeString(value);
if (name.empty() || value.empty()) continue;
if (read_net)
net->net_cfg[name] = value;
else
net->layers_cfg[layers_counter][name] = value;
}
bool read_net = false;
int layers_counter = -1;
for (std::string line; std::getline(ifile, line);) {
line = escapeString(line);
if (line.empty()) continue;
switch (line[0]) {
case '\0': break;
case '#': break;
case ';': break;
case '[':
if (line == "[net]") {
read_net = true;
}
else {
// read section
read_net = false;
++layers_counter;
const size_t layer_type_size = line.find("]") - 1;
CV_Assert(layer_type_size < line.size());
std::string layer_type = line.substr(1, layer_type_size);
net->layers_cfg[layers_counter]["type"] = layer_type;
}
break;
default:
// read entry
const size_t separator_index = line.find('=');
CV_Assert(separator_index < line.size());
if (separator_index != std::string::npos) {
std::string name = line.substr(0, separator_index);
std::string value = line.substr(separator_index + 1, line.size() - (separator_index + 1));
name = escapeString(name);
value = escapeString(value);
if (name.empty() || value.empty()) continue;
if (read_net)
net->net_cfg[name] = value;
else
net->layers_cfg[layers_counter][name] = value;
}
}
std::string anchors = net->layers_cfg[net->layers_cfg.size() - 1]["anchors"];
std::vector<float> vec = getNumbers<float>(anchors);
std::map<std::string, std::string> &net_params = net->net_cfg;
net->width = getParam(net_params, "width", 416);
net->height = getParam(net_params, "height", 416);
net->channels = getParam(net_params, "channels", 3);
CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0);
}
else
return false;
std::string anchors = net->layers_cfg[net->layers_cfg.size() - 1]["anchors"];
std::vector<float> vec = getNumbers<float>(anchors);
std::map<std::string, std::string> &net_params = net->net_cfg;
net->width = getParam(net_params, "width", 416);
net->height = getParam(net_params, "height", 416);
net->channels = getParam(net_params, "channels", 3);
CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0);
int current_channels = net->channels;
net->out_channels_vec.resize(net->layers_cfg.size());
int layers_counter = -1;
layers_counter = -1;
setLayersParams setParams(net);
@ -676,13 +669,8 @@ namespace cv {
return true;
}
bool ReadDarknetFromWeightsFile(const char *darknetModel, NetParameter *net)
bool ReadDarknetFromWeightsStream(std::istream &ifile, NetParameter *net)
{
std::ifstream ifile;
ifile.open(darknetModel, std::ios::binary);
CV_Assert(ifile.is_open());
int32_t major_ver, minor_ver, revision;
ifile.read(reinterpret_cast<char *>(&major_ver), sizeof(int32_t));
ifile.read(reinterpret_cast<char *>(&minor_ver), sizeof(int32_t));
@ -778,19 +766,18 @@ namespace cv {
}
void ReadNetParamsFromCfgFileOrDie(const char *cfgFile, darknet::NetParameter *net)
void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net)
{
if (!darknet::ReadDarknetFromCfgFile(cfgFile, net)) {
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(cfgFile));
if (!darknet::ReadDarknetFromCfgStream(ifile, net)) {
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream");
}
}
void ReadNetParamsFromBinaryFileOrDie(const char *darknetModel, darknet::NetParameter *net)
void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net)
{
if (!darknet::ReadDarknetFromWeightsFile(darknetModel, net)) {
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(darknetModel));
if (!darknet::ReadDarknetFromWeightsStream(ifile, net)) {
CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream");
}
}
}
}

@ -109,10 +109,9 @@ namespace cv {
};
}
// Read parameters from a file into a NetParameter message.
void ReadNetParamsFromCfgFileOrDie(const char *cfgFile, darknet::NetParameter *net);
void ReadNetParamsFromBinaryFileOrDie(const char *darknetModel, darknet::NetParameter *net);
// Read parameters from a stream into a NetParameter message.
void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net);
void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net);
}
}
#endif

@ -1492,7 +1492,8 @@ struct Net::Impl
// TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
ld.layerInstance->type != "Concat")) )
continue;
Ptr<Layer>& currLayer = ld.layerInstance;
@ -1701,6 +1702,31 @@ struct Net::Impl
ld.outputBlobs.size() == 1 )
{
Mat& output = ld.outputBlobs[0];
UMat umat_output;
if (!ld.outputBlobsWrappers.empty() &&
(preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
{
size_t i, ninputs = ld.inputBlobsId.size();
bool conv_layer = true;
for( i = 0; i < ninputs; i++ )
{
LayerPin pin = ld.inputBlobsId[i];
LayerData* inp_i_data = &layers[pin.lid];
while(inp_i_data->skip &&
inp_i_data->inputBlobsId.size() == 1 &&
inp_i_data->consumers.size() == 1)
{
pin = inp_i_data->inputBlobsId[0];
inp_i_data = &layers[pin.lid];
}
conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
}
if (!conv_layer)
continue;
std::vector<UMat> umat_outputBlobs;
umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
umat_output = umat_outputBlobs[0];
}
// TODO: in general, this optimization can always be done, but
// many layers currently check that the input/output blobs are
@ -1737,6 +1763,14 @@ struct Net::Impl
// Allocate new memory to prevent collisions during memory
// reusing (see https://github.com/opencv/opencv/pull/10456).
output = output.clone();
if (preferableBackend == DNN_BACKEND_OPENCV &&
IS_DNN_OPENCL_TARGET(preferableTarget))
{
std::vector<UMat> umats(1);
umat_output = umat_output.clone();
umats[0] = umat_output;
OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
}
Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
int ofs = 0;
for( i = 0; i < ninputs; i++ )
@ -1753,6 +1787,12 @@ struct Net::Impl
CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
Mat* oldPtr = &curr_output;
curr_output = output_slice;
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
{
std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
umats[pin.oid] = umat_output(chrange);
OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
}
// Layers that refer old input Mat will refer to the
// new data but the same Mat object.
CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output);
@ -3086,6 +3126,23 @@ Net readNet(const String& _model, const String& _config, const String& _framewor
model + (config.empty() ? "" : ", " + config));
}
Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
const std::vector<uchar>& bufferConfig)
{
String framework = _framework.toLowerCase();
if (framework == "caffe")
return readNetFromCaffe(bufferConfig, bufferModel);
else if (framework == "tensorflow")
return readNetFromTensorflow(bufferModel, bufferConfig);
else if (framework == "darknet")
return readNetFromDarknet(bufferConfig, bufferModel);
else if (framework == "torch")
CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
else if (framework == "dldt")
CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
}
Net readNetFromModelOptimizer(const String &xml, const String &bin)
{
return Net::readFromModelOptimizer(xml, bin);

@ -295,7 +295,9 @@ public:
for (int i = 0; i < num; i++)
confPreds.push_back(Mat(2, shape, CV_32F));
UMat umat = inp1.reshape(1, num * numPredsPerClass);
shape[0] = num * numPredsPerClass;
shape[1] = inp1.total() / shape[0];
UMat umat = inp1.reshape(1, 2, &shape[0]);
for (int i = 0; i < num; ++i)
{
Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
@ -342,7 +344,7 @@ public:
// Decode all loc predictions to bboxes
bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
_shareLocation, _numLocClasses, _backgroundLabelId,
_codeType, _varianceEncodedInTarget, false,
_codeType, _varianceEncodedInTarget, _clip,
allDecodedBBoxes);
if (!ret)
return false;

@ -369,15 +369,11 @@ public:
// clip the prior's coordinate such that it is within [0, 1]
if (_clip)
{
Mat mat = outputs[0].getMat(ACCESS_READ);
int aspect_count = (_maxSize > 0) ? 1 : 0;
int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
float* outputPtr = mat.ptr<float>() + offset;
int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
for (size_t d = 0; d < _outChannelSize; ++d)
{
outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
}
ocl::Kernel kernel("clip", ocl::dnn::prior_box_oclsrc, opts);
size_t nthreads = _layerHeight * _layerWidth * _numPriors * 4;
if (!kernel.args((int)nthreads, ocl::KernelArg::PtrReadWrite(outputs[0]))
.run(1, &nthreads, NULL, false))
return false;
}
// set the variance.

@ -14,7 +14,7 @@ namespace cv { namespace dnn {
class ResizeLayerImpl : public ResizeLayer
{
public:
ResizeLayerImpl(const LayerParams& params)
ResizeLayerImpl(const LayerParams& params) : scaleWidth(0), scaleHeight(0)
{
setParamsFrom(params);
outWidth = params.get<float>("width", 0);

@ -14,6 +14,7 @@ public:
ShuffleChannelLayerImpl(const LayerParams& params)
{
group = params.get<int>("group", 1);
setParamsFrom(params);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,

@ -110,27 +110,26 @@ public:
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
UMat& src = inputs[0];
UMat& dstMat = outputs[0];
int axis = clamp(axisRaw, src.dims);
if (softmaxOp.empty())
{
OCL4DNNSoftmaxConfig config;
config.in_shape = shape(inputs[0]);
config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw];
config.axis = axis;
config.channels = inputs[0].size[axis];
config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}
UMat& src = inputs[0];
UMat& dstMat = outputs[0];
if (softmaxOp->Forward(src, dstMat))
return true;
UMat& bufMat = internals[0];
int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
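
The softmax change above normalizes the axis once with clamp() before it is used for both the OCL4DNN configuration and the fallback path. Assuming clamp() follows the usual dnn convention for negative axes, its effect is roughly:

#include <opencv2/core.hpp>

// sketch of the assumed behaviour: map a possibly negative (Python-style) axis
// into [0, dims) and validate it
static int normalizeAxis(int axis, int dims)
{
    int a = axis < 0 ? axis + dims : axis;
    CV_Assert(0 <= a && a < dims);
    return a;
}

int main()
{
    CV_Assert(normalizeAxis(-1, 4) == 3);  // last dimension
    CV_Assert(normalizeAxis( 1, 4) == 1);
    return 0;
}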

@ -612,7 +612,7 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
ret = k.run(1, globalsize, localsize, false);
}
if ((row_size % 4) != 0 && ret)
if (row_size < 4 || ((row_size % 4) != 0 && ret))
{
String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);

@ -821,7 +821,7 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
region.origin = offset * element_size;
region.origin = offset * element_size + buffer.offset;
region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
@ -853,6 +853,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
int32_t bias_offset;
int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false))
@ -931,10 +932,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_);
@ -1024,10 +1027,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_);
@ -1079,6 +1084,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
@ -1126,6 +1132,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (void *)NULL);
kernel.set(argIdx++, bias_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
kernel.set(argIdx++, output_image_offset);
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
@ -1230,20 +1237,22 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
tuned_ = saved_tuned;
UMat new_top, new_verify_top;
float *data, *verify_data;
Mat mat_top, mat_verify_top;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);
data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
mat_top = new_top.getMat(ACCESS_READ);
mat_verify_top = new_verify_top.getMat(ACCESS_READ);
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
mat_top = top.getMat(ACCESS_READ);
mat_verify_top = verifyTop.getMat(ACCESS_READ);
}
const float* data = mat_top.ptr<float>();
const float* verify_data = mat_verify_top.ptr<float>();
for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) {

@ -136,7 +136,8 @@ __kernel void ConvolveBasic(
int kernel_offset,
__global Dtype* bias,
const int bias_offset,
__global Dtype* convolved_image,
__global Dtype* convolved_image_base,
const int convolved_image_base_offset,
const int convolved_image_offset,
const ushort input_width,
const ushort input_height,
@ -146,6 +147,7 @@ __kernel void ConvolveBasic(
const ushort pad_h
)
{
__global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int kernelNum = get_global_id(2) * ZPAR;
@ -220,12 +222,14 @@ convolve_simd(
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
__global Dtype* outputs,
__global Dtype* outputs_base,
const int outputs_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
__global Dtype* outputs = outputs_base + outputs_offset;
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
@ -395,7 +399,8 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
const __global Dtype *src0, \
const __global Dtype *src1, \
BIAS_KERNEL_ARG \
__global Dtype *dst, \
__global Dtype *dst_base, \
const int dst_offset, \
const ushort input_width, \
const ushort input_height, \
const ushort output_width, \
@ -425,6 +430,7 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
__global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -813,6 +819,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
__global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -1374,6 +1381,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
__global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -1559,6 +1567,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
__global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -1770,12 +1779,13 @@ __kernel void DWCONV(
__global Dtype* image_data,
__global Dtype* kernel_data,
BIAS_KERNEL_ARG
__global Dtype* convolved_image,
__global Dtype* convolved_image_base,
const int convolved_image_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height) {
__global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int outputZ = get_global_id(2);

@ -107,3 +107,13 @@ __kernel void set_variance(const int nthreads,
vstore4(var_vec, 0, dst + offset + index * 4);
}
}
__kernel void clip(const int nthreads,
__global Dtype* dst)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
Dtype4 vec = vload4(index, dst);
vstore4(clamp(vec, 0, 1), index, dst);
}
}

@ -1856,5 +1856,14 @@ Net readNetFromTensorflow(const char* bufferModel, size_t lenModel,
return net;
}
Net readNetFromTensorflow(const std::vector<uchar>& bufferModel, const std::vector<uchar>& bufferConfig)
{
const char* bufferModelPtr = reinterpret_cast<const char*>(&bufferModel[0]);
const char* bufferConfigPtr = bufferConfig.empty() ? NULL :
reinterpret_cast<const char*>(&bufferConfig[0]);
return readNetFromTensorflow(bufferModelPtr, bufferModel.size(),
bufferConfigPtr, bufferConfig.size());
}
CV__DNN_EXPERIMENTAL_NS_END
}} // namespace

@ -65,6 +65,34 @@ TEST(Test_Darknet, read_yolo_voc)
ASSERT_FALSE(net.empty());
}
TEST(Test_Darknet, read_yolo_voc_stream)
{
Mat ref;
Mat sample = imread(_tf("dog416.png"));
Mat inp = blobFromImage(sample, 1.0/255, Size(416, 416), Scalar(), true, false);
const std::string cfgFile = findDataFile("dnn/yolo-voc.cfg", false);
const std::string weightsFile = findDataFile("dnn/yolo-voc.weights", false);
// Import by paths.
{
Net net = readNetFromDarknet(cfgFile, weightsFile);
net.setInput(inp);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
ref = net.forward();
}
// Import from bytes array.
{
std::string cfg, weights;
readFileInMemory(cfgFile, cfg);
readFileInMemory(weightsFile, weights);
Net net = readNetFromDarknet(&cfg[0], cfg.size(), &weights[0], weights.size());
net.setInput(inp);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
Mat out = net.forward();
normAssert(ref, out);
}
}
class Test_Darknet_layers : public DNNTestLayer
{
public:

@ -104,8 +104,14 @@ TEST_P(Convolution, Accuracy)
int backendId = get<0>(get<7>(GetParam()));
int targetId = get<1>(get<7>(GetParam()));
if ((backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD) ||
(backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16))
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
throw SkipTestException("");
// TODO: unstable test cases
if (backendId == DNN_BACKEND_OPENCV && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
inChannels == 6 && outChannels == 9 && group == 1 && inSize == Size(5, 6) &&
kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1) && dilation == Size(1, 1) &&
hasBias)
throw SkipTestException("");
int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
@ -353,8 +359,7 @@ TEST_P(FullyConnected, Accuracy)
bool hasBias = get<3>(GetParam());
int backendId = get<0>(get<4>(GetParam()));
int targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_INFERENCE_ENGINE ||
(backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16))
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
throw SkipTestException("");
Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
@ -692,10 +697,6 @@ TEST_P(Eltwise, Accuracy)
int backendId = get<0>(get<4>(GetParam()));
int targetId = get<1>(get<4>(GetParam()));
if (backendId == DNN_BACKEND_OPENCV &&
(targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
Net net;
std::vector<int> convLayerIds(numConv);

@ -763,8 +763,7 @@ TEST_P(Test_Caffe_layers, Average_pooling_kernel_area)
// Test PriorBoxLayer in case of no aspect ratios (just squared proposals).
TEST_P(Test_Caffe_layers, PriorBox_squares)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
(backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)))
if (backend == DNN_BACKEND_INFERENCE_ENGINE)
throw SkipTestException("");
LayerParams lp;
lp.name = "testPriorBox";
@ -791,7 +790,8 @@ TEST_P(Test_Caffe_layers, PriorBox_squares)
0.25, 0.0, 1.0, 1.0,
0.1f, 0.1f, 0.2f, 0.2f,
0.1f, 0.1f, 0.2f, 0.2f);
normAssert(out.reshape(1, 4), ref);
double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-5 : 1e-5;
normAssert(out.reshape(1, 4), ref, "", l1);
}
typedef TestWithParam<tuple<int, int> > Layer_Test_DWconv_Prelu;

@ -243,10 +243,15 @@ TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
runTensorFlowNet("l2_normalize_3d");
}
typedef testing::TestWithParam<Target> Test_TensorFlow_nets;
class Test_TensorFlow_nets : public DNNTestLayer {};
TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
{
checkBackend();
if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
std::string imgPath = findDataFile("dnn/street.png", false);
@ -260,29 +265,30 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
outNames[1] = "concat_1";
outNames[2] = "detection_out";
std::vector<Mat> target(outNames.size());
std::vector<Mat> refs(outNames.size());
for (int i = 0; i < outNames.size(); ++i)
{
std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco." + outNames[i] + ".npy", false);
target[i] = blobFromNPY(path);
refs[i] = blobFromNPY(path);
}
Net net = readNetFromTensorflow(netPath, netConfig);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(GetParam());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
net.setInput(inp);
std::vector<Mat> output;
net.forward(output, outNames);
normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
normAssertDetections(target[2], output[2], "", 0.2);
normAssert(refs[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
normAssert(refs[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
normAssertDetections(refs[2], output[2], "", 0.2);
}
TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
{
checkBackend();
std::string proto = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", false);
std::string model = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false);
@ -290,8 +296,8 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
Mat img = imread(findDataFile("dnn/street.png", false));
Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(GetParam());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
net.setInput(blob);
// Output has shape 1x1xNx7 where N - number of detections.
@ -302,16 +308,24 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
normAssertDetections(ref, out, "", 0.5);
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.025 : default_lInf;
normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff);
}
TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
{
checkBackend();
if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
std::string proto = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", false);
std::string model = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false);
Net net = readNetFromTensorflow(model, proto);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
Mat img = imread(findDataFile("dnn/dog416.png", false));
Mat blob = blobFromImage(img, 1.0f / 127.5, Size(800, 600), Scalar(127.5, 127.5, 127.5), true, false);
@ -324,6 +338,11 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
{
checkBackend();
if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
(target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD))
throw SkipTestException("");
std::string proto = findDataFile("dnn/opencv_face_detector.pbtxt", false);
std::string model = findDataFile("dnn/opencv_face_detector_uint8.pb", false);
@ -331,9 +350,8 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
Mat blob = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(GetParam());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
net.setInput(blob);
// Output has shape 1x1xNx7, where N is the number of detections.
// Each detection is a vector of values [id, classId, confidence, left, top, right, bottom]
@ -346,7 +364,9 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
normAssertDetections(ref, out, "", 0.9, 3.4e-3, 1e-2);
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 4e-3 : 3.4e-3;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.017 : 1e-2;
normAssertDetections(ref, out, "", 0.9, scoreDiff, iouDiff);
}
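The SSD and face-detector tests above both decode a DNN output of shape 1x1xNx7, where each of the N rows is [id, classId, confidence, left, top, right, bottom]. A minimal sketch of walking such a blob (the helper name, threshold, and printing are illustrative, not part of the patch):

    #include <opencv2/core.hpp>
    #include <cstdio>

    // Hypothetical helper: print every detection above a confidence threshold.
    // 'out' is assumed to be the 1x1xNx7 blob returned by net.forward().
    static void dumpDetections(const cv::Mat& out, float confThreshold = 0.5f)
    {
        cv::Mat det = out.reshape(1, (int)out.total() / 7);   // N x 7 view
        for (int i = 0; i < det.rows; ++i)
        {
            const float* row = det.ptr<float>(i);
            if (row[2] < confThreshold)                        // row[2] = confidence
                continue;
            // row[1] = class id; row[3..6] = normalized left, top, right, bottom
            std::printf("class=%d conf=%.3f box=[%.3f %.3f %.3f %.3f]\n",
                        (int)row[1], row[2], row[3], row[4], row[5], row[6]);
        }
    }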
// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png')
@ -360,6 +380,10 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
// np.save('east_text_detection.geometry.npy', geometry)
TEST_P(Test_TensorFlow_nets, EAST_text_detection)
{
checkBackend();
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false);
std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false);
std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false);
@ -367,7 +391,8 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false));
net.setPreferableTarget(GetParam());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
Mat img = imread(imgPath);
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
@ -386,7 +411,7 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 1e-4, 3e-3);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, dnnBackendsAndTargets());
TEST_P(Test_TensorFlow_layers, fp16_weights)
{

@ -177,6 +177,7 @@ void BOWImgDescriptorExtractor::compute( InputArray keypointDescriptors, OutputA
CV_INSTRUMENT_REGION()
CV_Assert( !vocabulary.empty() );
CV_Assert(!keypointDescriptors.empty());
int clusterCount = descriptorSize(); // = vocabulary.rows

@ -264,6 +264,8 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag
convexHull(Mat(contours[contourIdx]), hull);
double area = contourArea(Mat(contours[contourIdx]));
double hullArea = contourArea(Mat(hull));
if (fabs(hullArea) < DBL_EPSILON)
continue;
double ratio = area / hullArea;
if (ratio < params.minConvexity || ratio >= params.maxConvexity)
continue;
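The new fabs(hullArea) < DBL_EPSILON guard skips contours whose convex hull has numerically zero area before the convexity ratio is formed. In formula form (the notation is introduced here, not in the patch):

\[
\text{convexity}(C) = \frac{\operatorname{contourArea}(C)}{\operatorname{contourArea}(\operatorname{hull}(C))},
\qquad
\text{keep } C \iff \text{minConvexity} \le \text{convexity}(C) < \text{maxConvexity}.
\]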
@ -309,6 +311,7 @@ void SimpleBlobDetectorImpl::detect(InputArray image, std::vector<cv::KeyPoint>&
CV_INSTRUMENT_REGION()
keypoints.clear();
CV_Assert(params.minRepeatability != 0);
Mat grayscaleImage;
if (image.channels() == 3 || image.channels() == 4)
cvtColor(image, grayscaleImage, COLOR_BGR2GRAY);

@ -506,6 +506,7 @@ BRISK_Impl::smoothedIntensity(const cv::Mat& image, const cv::Mat& integral, con
// scaling:
const int scaling = (int)(4194304.0 / area);
const int scaling2 = int(float(scaling) * area / 1024.0);
CV_Assert(scaling2 != 0);
// the integral image is larger:
const int integralcols = imagecols + 1;
@ -2238,6 +2239,7 @@ BriskLayer::value(const cv::Mat& mat, float xf, float yf, float scale_in) const
// scaling:
const int scaling = (int)(4194304.0f / area);
const int scaling2 = (int)(float(scaling) * area / 1024.0f);
CV_Assert(scaling2 != 0);
// calculate borders
const float x_1 = xf - sigma_half;

@ -271,7 +271,7 @@ void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
}
}
PERF_TEST(Transform, getPerspectiveTransform)
PERF_TEST(Transform, getPerspectiveTransform_1000)
{
unsigned int size = 8;
Mat source(1, size/2, CV_32FC2);
@ -280,12 +280,14 @@ PERF_TEST(Transform, getPerspectiveTransform)
declare.in(source, destination, WARMUP_RNG);
TEST_CYCLE()
PERF_SAMPLE_BEGIN()
for (int i = 0; i < 1000; i++)
{
transformCoefficient = getPerspectiveTransform(source, destination);
}
PERF_SAMPLE_END()
SANITY_CHECK(transformCoefficient, 1e-5);
SANITY_CHECK_NOTHING();
}
} // namespace

@ -209,7 +209,14 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
CV_Error( CV_StsBadSize, "" );
CvContourScanner scanner = (CvContourScanner)cvAlloc( sizeof( *scanner ));
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
memset( scanner, 0, sizeof(*scanner) );
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
scanner->storage1 = scanner->storage2 = storage;
scanner->img0 = (schar *) img;
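This merge wraps several legacy-API memset calls in the same GCC 8 pragma pair to silence -Wclass-memaccess (the drawing, flood-fill, moments-test and Haar hunks below use it too). The pattern in isolation, on a hypothetical struct whose in-class initializers make it non-trivial and therefore trigger the warning:

    #include <cstring>

    struct LegacyComp                 // stand-in for a CvConnectedComp-like type;
    {                                 // the initializers make it non-trivial,
        double area    = 0.0;         // which is what -Wclass-memaccess flags
        void*  contour = nullptr;
    };

    static void resetComp(LegacyComp* comp)
    {
    #if defined __GNUC__ && __GNUC__ >= 8
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wclass-memaccess"
    #endif
        std::memset(comp, 0, sizeof(*comp));   // zero-fill is the intended behaviour
    #if defined __GNUC__ && __GNUC__ >= 8
    #pragma GCC diagnostic pop
    #endif
    }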

@ -546,10 +546,10 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
size_t lmsz = dev.localMemSize();
size_t src_step = _src.step(), src_offset = _src.offset();
const size_t tileSizeYmax = wgs / tileSizeX;
CV_Assert(src_step != 0 && esz != 0);
// workaround for NVIDIA: 3 channel vector type takes 4*elem_size in local memory
int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
if (((src_offset % src_step) % esz == 0) &&
(
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||

@ -2563,6 +2563,11 @@ static const int CodeDeltas[8][2] =
#define CV_ADJUST_EDGE_COUNT( count, seq ) \
((count) -= ((count) == (seq)->total && !CV_IS_SEQ_CLOSED(seq)))
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
CV_IMPL void
cvDrawContours( void* _img, CvSeq* contour,
CvScalar _externalColor, CvScalar _holeColor,
@ -2894,4 +2899,8 @@ cvGetTextSize( const char *text, const CvFont *_font, CvSize *_size, int *_base_
*_size = size;
}
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop // "-Wclass-memaccess"
#endif
/* End of file. */

@ -4284,10 +4284,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
size_t src_step = _src.step(), src_offset = _src.offset();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
!(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
borderType == BORDER_REFLECT_101))
if (esz == 0
|| (src_offset % src_step) % esz != 0
|| (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|| !(borderType == BORDER_CONSTANT
|| borderType == BORDER_REPLICATE
|| borderType == BORDER_REFLECT
|| borderType == BORDER_WRAP
|| borderType == BORDER_REFLECT_101))
return false;
size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };

@ -642,8 +642,15 @@ cvFloodFill( CvArr* arr, CvPoint seed_point,
CvScalar newVal, CvScalar lo_diff, CvScalar up_diff,
CvConnectedComp* comp, int flags, CvArr* maskarr )
{
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
if( comp )
memset( comp, 0, sizeof(*comp) );
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
cv::Mat img = cv::cvarrToMat(arr), mask = cv::cvarrToMat(maskarr);
int area = cv::floodFill(img, mask, seed_point, newVal,

@ -174,6 +174,7 @@ void GMM::addSample( int ci, const Vec3d color )
void GMM::endLearning()
{
CV_Assert(totalSampleCount > 0);
const double variance = 0.01;
for( int ci = 0; ci < componentsCount; ci++ )
{

@ -50,6 +50,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include <opencv2/core/utils/configuration.private.hpp>
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
#include "opencv2/core/softfloat.hpp"
@ -3061,7 +3062,9 @@ cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
b[i+4] = dst[i].y;
}
solve( A, B, X, DECOMP_SVD );
static int param_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD =
(int)utils::getConfigurationParameterSizeT("OPENCV_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD", (size_t)DECOMP_LU);
solve(A, B, X, param_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD);
M.ptr<double>()[8] = 1.;
return M;
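The hunk above switches getPerspectiveTransform from DECOMP_SVD to DECOMP_LU by default and caches the choice from the OPENCV_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD configuration parameter in a static variable. A hedged sketch of restoring the previous behaviour from a client (it assumes DECOMP_SVD == 1, as declared in core.hpp, and uses POSIX setenv; the point values are illustrative):

    #include <opencv2/imgproc.hpp>
    #include <cstdlib>

    int main()
    {
        // Must be set before the first getPerspectiveTransform call,
        // because the parameter is read once into a static variable.
        setenv("OPENCV_IMGPROC_GETPERSPECTIVETRANSFORM_SOLVE_METHOD", "1", 1);  // 1 == DECOMP_SVD

        cv::Point2f src[4] = { {0, 0}, {1, 0}, {1, 1}, {0, 1} };
        cv::Point2f dst[4] = { {0, 0}, {2, 0}, {2, 2}, {0, 2} };
        cv::Mat M = cv::getPerspectiveTransform(src, dst);   // solved with DECOMP_SVD again
        return 0;
    }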
@ -3283,6 +3286,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
if (!(flags & CV_WARP_INVERSE_MAP))
{
CV_Assert(!dsize.empty());
double Kangle = CV_2PI / dsize.height;
int phi, rho;
@ -3329,6 +3333,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
Mat src = _dst.getMat();
Size ssize = _dst.size();
ssize.height -= 2 * ANGLE_BORDER;
CV_Assert(!ssize.empty());
const double Kangle = CV_2PI / ssize.height;
double Kmag;
if (semiLog)

@ -47,6 +47,7 @@ static const double eps = 1e-6;
static void fitLine2D_wods( const Point2f* points, int count, float *weights, float *line )
{
CV_Assert(count > 0);
double x = 0, y = 0, x2 = 0, y2 = 0, xy = 0, w = 0;
double dx2, dy2, dxy;
int i;
@ -98,6 +99,7 @@ static void fitLine2D_wods( const Point2f* points, int count, float *weights, fl
static void fitLine3D_wods( const Point3f * points, int count, float *weights, float *line )
{
CV_Assert(count > 0);
int i;
float w0 = 0;
float x0 = 0, y0 = 0, z0 = 0;

@ -772,6 +772,7 @@ bool LineSegmentDetectorImpl::refine(std::vector<RegionPoint>& reg, double reg_a
++n;
}
}
CV_Assert(n > 0);
double mean_angle = sum / double(n);
// 2 * standard deviation
double tau = 2.0 * sqrt((s_sum - 2.0 * mean_angle * sum) / double(n) + mean_angle * mean_angle);

@ -495,6 +495,13 @@ static bool ocl_moments( InputArray _src, Moments& m, bool binary)
const int TILE_SIZE = 32;
const int K = 10;
Size sz = _src.getSz();
int xtiles = divUp(sz.width, TILE_SIZE);
int ytiles = divUp(sz.height, TILE_SIZE);
int ntiles = xtiles*ytiles;
if (ntiles == 0)
return false;
ocl::Kernel k = ocl::Kernel("moments", ocl::imgproc::moments_oclsrc,
format("-D TILE_SIZE=%d%s",
TILE_SIZE,
@ -504,10 +511,6 @@ static bool ocl_moments( InputArray _src, Moments& m, bool binary)
return false;
UMat src = _src.getUMat();
Size sz = src.size();
int xtiles = (sz.width + TILE_SIZE-1)/TILE_SIZE;
int ytiles = (sz.height + TILE_SIZE-1)/TILE_SIZE;
int ntiles = xtiles*ytiles;
UMat umbuf(1, ntiles*K, CV_32S);
size_t globalsize[] = {(size_t)xtiles, std::max((size_t)TILE_SIZE, (size_t)sz.height)};
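The moments hunk computes the tile grid before building the OpenCL kernel and bails out when the image yields zero tiles; the hand-rolled rounding-up division it replaces is equivalent to cv::divUp (the image size below is illustrative):

    #include <opencv2/core/utility.hpp>
    #include <cassert>

    int main()
    {
        const int TILE_SIZE = 32;
        const cv::Size sz(100, 45);                               // illustrative image size
        int xtiles_old = (sz.width + TILE_SIZE - 1) / TILE_SIZE;  // removed form
        int xtiles_new = cv::divUp(sz.width, TILE_SIZE);          // added form
        assert(xtiles_old == xtiles_new && xtiles_new == 4);
        return 0;
    }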

@ -1709,6 +1709,7 @@ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
{
CV_Assert(n > 0);
const int SMALL_GAUSSIAN_SIZE = 7;
static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
{
@ -1747,6 +1748,7 @@ cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
}
}
CV_DbgAssert(fabs(sum) > 0);
sum = 1./sum;
for( i = 0; i < n; i++ )
{
@ -5329,6 +5331,7 @@ public:
wsum += w;
}
// overflow is not possible here => there is no need to use cv::saturate_cast
CV_DbgAssert(fabs(wsum) > 0);
dptr[j] = (uchar)cvRound(sum/wsum);
}
}
@ -5414,6 +5417,7 @@ public:
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
CV_DbgAssert(fabs(wsum) > 0);
wsum = 1.f/wsum;
b0 = cvRound(sum_b*wsum);
g0 = cvRound(sum_g*wsum);
@ -5673,6 +5677,7 @@ public:
sum += val*w;
wsum += w;
}
CV_DbgAssert(fabs(wsum) > 0);
dptr[j] = (float)(sum/wsum);
}
}
@ -5763,6 +5768,7 @@ public:
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
CV_DbgAssert(fabs(wsum) > 0);
wsum = 1.f/wsum;
b0 = sum_b*wsum;
g0 = sum_g*wsum;
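Each of these bilateral-filter loops ends with the same weighted-average normalization that the new CV_DbgAssert(fabs(wsum) > 0) lines protect. In formula form (Ω, w and I are notation introduced here, not names from the patch):

\[
I'(p) \;=\; \frac{\sum_{q \in \Omega(p)} w(p, q)\, I(q)}{\sum_{q \in \Omega(p)} w(p, q)},
\qquad
w_{\mathrm{sum}}(p) \;=\; \sum_{q \in \Omega(p)} w(p, q) \;>\; 0 .
\]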

@ -89,7 +89,6 @@ void CV_GrabcutTest::run( int /* start_from */)
Mat exp_bgdModel, exp_fgdModel;
Mat mask;
mask = Scalar(0);
Mat bgdModel, fgdModel;
grabCut( img, mask, rect, bgdModel, fgdModel, 0, GC_INIT_WITH_RECT );
bgdModel.copyTo(exp_bgdModel);

@ -186,7 +186,14 @@ void CV_MomentsTest::prepare_to_validation( int /*test_case_idx*/ )
int i, y, x, cols = src.cols;
double xc = 0., yc = 0.;
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
memset( &m, 0, sizeof(m));
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
int coi = 0;
for( y = 0; y < src.rows; y++ )

@ -67,6 +67,10 @@ type_dict = {
"double[]": { "j_type" : "double[]", "jn_type" : "double[]", "jni_type" : "jdoubleArray", "suffix" : "_3D" }
}
# Defines a rule to add extra prefixes for names from specific namespaces.
# For example, cv::fisheye::stereoRectify from namespace fisheye is wrapped as fisheye_stereoRectify
namespaces_dict = {}
# { class : { func : {j_code, jn_code, cpp_code} } }
ManualFuncs = {}
@ -148,6 +152,8 @@ class ConstInfo(GeneralInfo):
self.cname = self.name.replace(".", "::")
self.value = decl[1]
self.addedManually = addedManually
if self.namespace in namespaces_dict:
self.name = '%s_%s' % (namespaces_dict[self.namespace], self.name)
def __repr__(self):
return Template("CONST $name=$value$manual").substitute(name=self.name,
@ -297,11 +303,13 @@ class ArgInfo():
class FuncInfo(GeneralInfo):
def __init__(self, decl, namespaces=[]): # [ funcname, return_ctype, [modifiers], [args] ]
GeneralInfo.__init__(self, "func", decl, namespaces)
self.cname = self.name.replace(".", "::")
self.cname = decl[0].replace(".", "::")
self.jname = self.name
self.isconstructor = self.name == self.classname
if "[" in self.name:
self.jname = "getelem"
if self.namespace in namespaces_dict:
self.jname = '%s_%s' % (namespaces_dict[self.namespace], self.jname)
for m in decl[2]:
if m.startswith("="):
self.jname = m[1:]
@ -688,9 +696,9 @@ class JavaWrapperGenerator(object):
# java part:
#java doc comment
f_name = fi.name
f_name = fi.jname
if fi.classname:
f_name = fi.classname + "::" + fi.name
f_name = fi.classname + "::" + fi.jname
java_doc = "//javadoc: " + f_name + "(%s)" % ", ".join([a.name for a in args if a.ctype])
j_code.write(" "*4 + java_doc + "\n")
@ -897,13 +905,10 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname
j_signatures.append(j_signature)
# processing args with default values
if not args or not args[-1].defval:
if args and args[-1].defval:
args.pop()
else:
break
while args and args[-1].defval:
# 'smart' overloads filtering
a = args.pop()
if a.name in ('mask', 'dtype', 'ddepth', 'lineType', 'borderType', 'borderMode', 'criteria'):
break
@ -1146,6 +1151,7 @@ if __name__ == "__main__":
type_dict.update(gen_type_dict.get("type_dict", {}))
ManualFuncs.update(gen_type_dict.get("ManualFuncs", {}))
func_arg_fix.update(gen_type_dict.get("func_arg_fix", {}))
namespaces_dict.update(gen_type_dict.get("namespaces_dict", {}))
if 'module_j_code' in gen_type_dict:
module_j_code = read_contents(checkFileRemap(os.path.join(misc_location, gen_type_dict['module_j_code'])))
if 'module_jn_code' in gen_type_dict:

@ -616,6 +616,7 @@ public:
expDiffSum += v; // sum_j(exp(L_ij - L_iq))
}
CV_Assert(expDiffSum > 0);
if(probs)
L.convertTo(*probs, ptype, 1./expDiffSum);

@ -170,6 +170,7 @@ public:
double val = std::abs(w->ord_responses[w->sidx[i]]);
max_response = std::max(max_response, val);
}
CV_Assert(fabs(max_response) > 0);
}
if( rparams.calcVarImportance )

@ -630,7 +630,7 @@ void DTreesImpl::calcValue( int nidx, const vector<int>& _sidx )
w->cv_Tn[nidx*cv_n + j] = INT_MAX;
}
}
CV_Assert(fabs(sumw) > 0);
node->node_risk = sum2 - (sum/sumw)*sum;
node->node_risk /= sumw;
node->value = sum/sumw;

@ -670,6 +670,21 @@ public:
void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
};
class CV_EXPORTS QRCodeDetector
{
public:
QRCodeDetector();
~QRCodeDetector();
void setEpsX(double epsX);
void setEpsY(double epsY);
bool detect(InputArray in, OutputArray points) const;
protected:
struct Impl;
Ptr<Impl> p;
};
/** @brief Detects a QR code in an image and returns the minimum-area quadrangle that describes it.
@param in Matrix of type CV_8UC1 containing the image in which the QR code is detected.
@param points Output vector of vertices of the minimal-area quadrangle that describes the QR code.
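Based only on the QRCodeDetector declaration above and on how the new tests call the detector, a minimal usage sketch could look like the following (the image name mirrors the test data set; the setEpsX/setEpsY values are illustrative):

    #include <opencv2/objdetect.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <cstdio>
    #include <vector>

    int main()
    {
        cv::Mat img = cv::imread("qr_test_030.jpg", cv::IMREAD_GRAYSCALE);

        cv::QRCodeDetector detector;
        detector.setEpsX(0.2);                 // horizontal detection tolerance (illustrative)
        detector.setEpsY(0.1);                 // vertical detection tolerance (illustrative)

        std::vector<cv::Point> points;         // corners of the minimal-area quadrangle
        if (detector.detect(img, points))
            for (const cv::Point& p : points)
                std::printf("(%d, %d)\n", p.x, p.y);
        return 0;
    }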

@ -67,6 +67,11 @@
# endif
#endif
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
/* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0
@ -599,7 +604,7 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
else
sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
}
CV_Assert(area0 > 0);
hidfeature->rect[0].weight = (float)(-sum0/area0);
} /* l */
} /* j */
@ -2290,4 +2295,8 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier,
icvReadHaarClassifier, icvWriteHaarClassifier,
icvCloneHaarClassifier );
#if defined __GNUC__ && __GNUC__ >= 8
#pragma GCC diagnostic pop
#endif
/* End of file. */

File diff suppressed because it is too large.

@ -1,74 +1,115 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
namespace opencv_test { namespace {
TEST(Objdetect_QRCode, regression)
namespace opencv_test
{
String qrcode_images_name[] = {
"20110817_030.jpg",
"20110817_048.jpg",
"img_20120226_161648.jpg",
"img_2714.jpg",
"img_2716.jpg",
"img_3011.jpg",
"img_3029.jpg",
"img_3070.jpg",
"qr_test_030.jpg"
};
// #define UPDATE_QRCODE_TEST_DATA
#ifdef UPDATE_QRCODE_TEST_DATA
TEST(Objdetect_QRCode, generate_test_data)
{
String root = cvtest::TS::ptr()->get_data_path() + "qrcode/";
// String cascades[] =
// {
// root + "haarcascade_frontalface_alt.xml",
// root + "lbpcascade_frontalface.xml",
// String()
// };
// vector<Rect> objects;
// RNG rng((uint64)-1);
// for( int i = 0; !cascades[i].empty(); i++ )
// {
// printf("%d. %s\n", i, cascades[i].c_str());
// CascadeClassifier cascade(cascades[i]);
// for( int j = 0; j < 100; j++ )
// {
// int width = rng.uniform(1, 100);
// int height = rng.uniform(1, 100);
// Mat img(height, width, CV_8U);
// randu(img, 0, 256);
// cascade.detectMultiScale(img, objects);
// }
// }
String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json";
FileStorage file_config(dataset_config, FileStorage::WRITE);
file_config << "test_images" << "[";
size_t images_count = sizeof(qrcode_images_name) / sizeof(String);
for (size_t i = 0; i < images_count; i++)
{
file_config << "{:" << "image_name" << qrcode_images_name[i];
String image_path = root + qrcode_images_name[i];
std::vector<Point> transform;
Mat src = imread(image_path, IMREAD_GRAYSCALE);
EXPECT_TRUE(detectQRCode(src, transform));
file_config << "x" << "[:";
for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].x; }
file_config << "]";
file_config << "y" << "[:";
for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].y; }
file_config << "]" << "}";
}
file_config << "]";
file_config.release();
}
}} // namespace
#else
typedef testing::TestWithParam< String > Objdetect_QRCode;
TEST_P(Objdetect_QRCode, regression)
{
String root = cvtest::TS::ptr()->get_data_path() + "qrcode/";
String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json";
FileStorage file_config(dataset_config, FileStorage::READ);
const int pixels_error = 3;
std::vector<Point> corners;
String image_path = root + String(GetParam());
Mat src = imread(image_path, IMREAD_GRAYSCALE);
EXPECT_TRUE(detectQRCode(src, corners));
if (file_config.isOpened())
{
FileNode images_list = file_config["test_images"];
int index = 0, images_count = static_cast<int>(images_list.size());
ASSERT_GT(images_count, 0);
bool runTestsFlag = false;
String name_current_image = String(GetParam());
for (; index < images_count; index++)
{
String name_test_image = images_list[index]["image_name"];
if (name_test_image == name_current_image)
{
for (int i = 0; i < 4; i++)
{
int x = images_list[index]["x"][i];
int y = images_list[index]["y"][i];
EXPECT_NEAR(x, corners[i].x, pixels_error);
EXPECT_NEAR(y, corners[i].y, pixels_error);
}
runTestsFlag = true;
}
}
if (!runTestsFlag)
{
std::cout << "Not found results for " << name_current_image;
std::cout << " image in dataset_config.json file." << std::endl;
}
file_config.release();
}
else
{
std::cout << " Not found dataset_config.json file." << std::endl;
}
}
INSTANTIATE_TEST_CASE_P(objdetect, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name));
TEST(Objdetect_QRCode, not_found_qrcode)
{
std::vector<Point> corners;
Mat zero_image = Mat::zeros(256, 256, CV_8UC1);
EXPECT_FALSE(detectQRCode(zero_image, corners));
}
#endif
} // namespace

@ -140,6 +140,7 @@ public:
double max;
minMaxLoc(gray_img, NULL, &max);
CV_Assert(max > 0);
Mat map;
log(gray_img + 1.0f, map);
@ -429,12 +430,15 @@ public:
for(int i = 0; i < max_iterations; i++)
{
calculateProduct(p, product);
float alpha = rr / static_cast<float>(p.dot(product));
double dprod = p.dot(product);
CV_Assert(fabs(dprod) > 0);
float alpha = rr / static_cast<float>(dprod);
r -= alpha * product;
x += alpha * p;
float new_rr = static_cast<float>(r.dot(r));
CV_Assert(fabs(rr) > 0);
p = r + (new_rr / rr) * p;
rr = new_rr;
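The added asserts in this tonemapping hunk guard the two denominators of what is otherwise a standard conjugate-gradient iteration (assuming calculateProduct(p, product) computes A·p, with rr playing the role of r_k^T r_k):

\[
\alpha_k = \frac{r_k^{\top} r_k}{p_k^{\top} A p_k}, \qquad
x_{k+1} = x_k + \alpha_k p_k, \qquad
r_{k+1} = r_k - \alpha_k A p_k, \qquad
p_{k+1} = r_{k+1} + \frac{r_{k+1}^{\top} r_{k+1}}{r_k^{\top} r_k}\, p_k .
\]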

@ -743,6 +743,7 @@ void SCDMatcher::hungarian(cv::Mat &costMatrix, std::vector<cv::DMatch> &outMatc
// calculate symmetric shape context cost
cv::Mat trueCostMatrix(costMatrix, cv::Rect(0,0,sizeScd1, sizeScd2));
CV_Assert(!trueCostMatrix.empty());
float leftcost = 0;
for (int nrow=0; nrow<trueCostMatrix.rows; nrow++)
{

@ -2125,7 +2125,7 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level,
switch( code )
{
case CMP_EPS_BIG_DIFF:
sprintf( msg, "%s: Too big difference (=%g)", desc, diff );
sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level );
code = TS::FAIL_BAD_ACCURACY;
break;
case CMP_EPS_INVALID_TEST_DATA:

@ -153,7 +153,7 @@ public:
bool initContainer(const String& filename, double fps, Size size, bool iscolor);
void startWriteAVI(int stream_count);
void writeStreamHeader(Codecs codec_);
void startWriteChunk(int fourcc);
void startWriteChunk(uint32_t fourcc);
void endWriteChunk();
int getAVIIndex(int stream_number, StreamType strm_type);

Some files were not shown because too many files have changed in this diff.
