Merge remote-tracking branch 'upstream/3.4' into merge-3.4

6 years ago · 631b246881
parent e1c8256602 dcdbaef348
commit 631b246881
68 changed files with 2446 additions and 1110 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -577,7 +577,7 @@ else()
    # Note: layout differs from OpenCV 3.4
    include(GNUInstallDirs)
    ocv_update(OPENCV_INCLUDE_INSTALL_PATH       "${CMAKE_INSTALL_INCLUDEDIR}/opencv4")
-    ocv_update(OPENCV_LIB_INSTALL_PATH           "${CMAKE_INSTALL_LIBDIR}${LIB_SUFFIX}")
+    ocv_update(OPENCV_LIB_INSTALL_PATH           "${CMAKE_INSTALL_LIBDIR}")
    ocv_update(OPENCV_CONFIG_INSTALL_PATH        "${OPENCV_LIB_INSTALL_PATH}/cmake/opencv4")
    ocv_update(OPENCV_3P_LIB_INSTALL_PATH        "${OPENCV_LIB_INSTALL_PATH}/opencv4/3rdparty")
    ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH   "${CMAKE_INSTALL_DATAROOTDIR}/opencv4/samples")
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@ -144,6 +144,7 @@ if(DOXYGEN_FOUND)
  string(REPLACE ";" " " CMAKE_DOXYGEN_ENABLED_SECTIONS "${CMAKE_DOXYGEN_ENABLED_SECTIONS}")
  # TODO: remove paths_doc from EXAMPLE_PATH after face module tutorials/samples moved to separate folders
  string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_EXAMPLE_PATH  "${example_path} ; ${paths_doc} ; ${paths_sample}")
+  string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_INCLUDE_ROOTS "${paths_include}")
  set(CMAKE_DOXYGEN_LAYOUT "${CMAKE_CURRENT_BINARY_DIR}/DoxygenLayout.xml")
  set(CMAKE_DOXYGEN_OUTPUT_PATH "doxygen")
  set(CMAKE_DOXYGEN_MAIN_REFERENCE "${refs_main}")
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@ -22,8 +22,8 @@ ABBREVIATE_BRIEF       = "The $name class" \
 ALWAYS_DETAILED_SEC    = NO
 INLINE_INHERITED_MEMB  = NO
 FULL_PATH_NAMES        = YES
-STRIP_FROM_PATH        = @CMAKE_SOURCE_DIR@/modules
-STRIP_FROM_INC_PATH    =
+STRIP_FROM_PATH        = @CMAKE_SOURCE_DIR@/modules @CMAKE_DOXYGEN_INCLUDE_ROOTS@
+STRIP_FROM_INC_PATH    = @CMAKE_DOXYGEN_INCLUDE_ROOTS@
 SHORT_NAMES            = NO
 JAVADOC_AUTOBRIEF      = NO
 QT_AUTOBRIEF           = NO
@ -72,8 +72,8 @@ INTERNAL_DOCS          = NO
 CASE_SENSE_NAMES       = YES
 HIDE_SCOPE_NAMES       = NO
 SHOW_INCLUDE_FILES     = YES
-SHOW_GROUPED_MEMB_INC  = NO
-FORCE_LOCAL_INCLUDES   = YES
+SHOW_GROUPED_MEMB_INC  = YES
+FORCE_LOCAL_INCLUDES   = NO
 INLINE_INFO            = YES
 SORT_MEMBER_DOCS       = YES
 SORT_BRIEF_DOCS        = YES
--- a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
+++ b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
@ -53,8 +53,8 @@ import numpy as np
 import cv2 as cv
 import matplotlib.pyplot as plt

-img1 = cv.imread('box.png',0)          # queryImage
-img2 = cv.imread('box_in_scene.png',0) # trainImage
+img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE)          # queryImage
+img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage

 # Initiate ORB detector
 orb = cv.ORB_create()
@ -79,7 +79,7 @@ matches = bf.match(des1,des2)
 matches = sorted(matches, key = lambda x:x.distance)

 # Draw first 10 matches.
-img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10], flags=2)
+img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10],None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)

 plt.imshow(img3),plt.show()
@endcode
@ -104,13 +104,13 @@ so that we can apply ratio test explained by D.Lowe in his paper.
@code{.py}
 import numpy as np
 import cv2 as cv
-from matplotlib import pyplot as plt
+import matplotlib.pyplot as plt

-img1 = cv.imread('box.png',0)          # queryImage
-img2 = cv.imread('box_in_scene.png',0) # trainImage
+img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE)          # queryImage
+img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage

 # Initiate SIFT detector
-sift = cv.SIFT()
+sift = cv.xfeatures2d.SIFT_create()

 # find the keypoints and descriptors with SIFT
 kp1, des1 = sift.detectAndCompute(img1,None)
@ -127,7 +127,7 @@ for m,n in matches:
        good.append([m])

 # cv.drawMatchesKnn expects list of lists as matches.
-img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,flags=2)
+img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)

 plt.imshow(img3),plt.show()
@endcode
@ -168,13 +168,13 @@ With this information, we are good to go.
@code{.py}
 import numpy as np
 import cv2 as cv
-from matplotlib import pyplot as plt
+import matplotlib.pyplot as plt

-img1 = cv.imread('box.png',0)          # queryImage
-img2 = cv.imread('box_in_scene.png',0) # trainImage
+img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE)          # queryImage
+img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage

 # Initiate SIFT detector
-sift = cv.SIFT()
+sift = cv.xfeatures2d.SIFT_create()

 # find the keypoints and descriptors with SIFT
 kp1, des1 = sift.detectAndCompute(img1,None)
@ -190,7 +190,7 @@ flann = cv.FlannBasedMatcher(index_params,search_params)
 matches = flann.knnMatch(des1,des2,k=2)

 # Need to draw only good matches, so create a mask
-matchesMask = [[0,0] for i in xrange(len(matches))]
+matchesMask = [[0,0] for i in range(len(matches))]

 # ratio test as per Lowe's paper
 for i,(m,n) in enumerate(matches):
@ -200,7 +200,7 @@ for i,(m,n) in enumerate(matches):
 draw_params = dict(matchColor = (0,255,0),
                   singlePointColor = (255,0,0),
                   matchesMask = matchesMask,
-                   flags = 0)
+                   flags = cv.DrawMatchesFlags_DEFAULT)

 img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,matches,None,**draw_params)

--- a/modules/calib3d/src/circlesgrid.cpp
+++ b/modules/calib3d/src/circlesgrid.cpp
@ -156,7 +156,7 @@ void CirclesGridClusterFinder::findGrid(const std::vector<cv::Point2f> &points,
 #endif

  std::vector<Point2f> hull2f;
-  convexHull(Mat(patternPoints), hull2f, false);
+  convexHull(patternPoints, hull2f, false);
  const size_t cornersCount = isAsymmetricGrid ? 6 : 4;
  if(hull2f.size() < cornersCount)
    return;
@ -407,7 +407,7 @@ void CirclesGridClusterFinder::rectifyPatternPoints(const std::vector<cv::Point2
    }
  }

-  Mat homography = findHomography(Mat(sortedCorners), Mat(idealPoints), 0);
+  Mat homography = findHomography(sortedCorners, idealPoints, 0);
  Mat rectifiedPointsMat;
  transform(patternPoints, rectifiedPointsMat, homography);
  rectifiedPatternPoints.clear();
@ -863,8 +863,8 @@ Mat CirclesGridFinder::rectifyGrid(Size detectedGridSize, const std::vector<Poin
    }
  }

-  Mat H = findHomography(Mat(centers), Mat(dstPoints), RANSAC);
-  //Mat H = findHomography( Mat( corners ), Mat( dstPoints ) );
+  Mat H = findHomography(centers, dstPoints, RANSAC);
+  //Mat H = findHomography(corners, dstPoints);

  if (H.empty())
  {
@ -880,7 +880,7 @@ Mat CirclesGridFinder::rectifyGrid(Size detectedGridSize, const std::vector<Poin
  }

  Mat dstKeypointsMat;
-  transform(Mat(srcKeypoints), dstKeypointsMat, H);
+  transform(srcKeypoints, dstKeypointsMat, H);
  std::vector<Point2f> dstKeypoints;
  convertPointsFromHomogeneous(dstKeypointsMat, dstKeypoints);

@ -1168,7 +1168,7 @@ void CirclesGridFinder::findBasis(const std::vector<Point2f> &samples, std::vect
  }
  for (size_t i = 0; i < basis.size(); i++)
  {
-    convexHull(Mat(clusters[i]), hulls[i]);
+    convexHull(clusters[i], hulls[i]);
  }

  basisGraphs.resize(basis.size(), Graph(keypoints.size()));
@ -1183,7 +1183,7 @@ void CirclesGridFinder::findBasis(const std::vector<Point2f> &samples, std::vect

      for (size_t k = 0; k < hulls.size(); k++)
      {
-        if (pointPolygonTest(Mat(hulls[k]), vec, false) >= 0)
+        if (pointPolygonTest(hulls[k], vec, false) >= 0)
        {
          basisGraphs[k].addEdge(i, j);
        }
@ -1414,7 +1414,6 @@ void CirclesGridFinder::drawHoles(const Mat &srcImage, Mat &drawImage) const
      if (i != holes.size() - 1)
        line(drawImage, keypoints[holes[i][j]], keypoints[holes[i + 1][j]], Scalar(255, 0, 0), 2);

-      //circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness);
      circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness);
    }
  }
--- a/modules/calib3d/src/homography_decomp.cpp
+++ b/modules/calib3d/src/homography_decomp.cpp
@ -185,6 +185,10 @@ bool HomographyDecompZhang::findMotionFrom_tstar_n(const cv::Vec3d& tstar, const
    temp(1, 1) += 1.0;
    temp(2, 2) += 1.0;
    motion.R = getHnorm() * temp.inv();
+    if (cv::determinant(motion.R) < 0)
+    {
+        motion.R *= -1;
+    }
    motion.t = motion.R * tstar;
    motion.n = n;
    return passesSameSideOfPlaneConstraint(motion);
@ -312,6 +316,10 @@ void HomographyDecompInria::findRmatFrom_tstar_n(const cv::Vec3d& tstar, const c
              0.0, 0.0, 1.0);

    R = getHnorm() * (I - (2/v) * tstar_m * n_m.t() );
+    if (cv::determinant(R) < 0)
+    {
+        R *= -1;
+    }
 }

 void HomographyDecompInria::decompose(std::vector<CameraMotion>& camMotions)
--- a/modules/calib3d/src/quadsubpix.cpp
+++ b/modules/calib3d/src/quadsubpix.cpp
@ -194,9 +194,8 @@ bool cv::find4QuadCornerSubpix(InputArray _img, InputOutputArray _corners, Size
        erode(white_comp, white_comp, Mat(), Point(-1, -1), erode_count);

        std::vector<std::vector<Point> > white_contours, black_contours;
-        std::vector<Vec4i> white_hierarchy, black_hierarchy;
-        findContours(black_comp, black_contours, black_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE);
-        findContours(white_comp, white_contours, white_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE);
+        findContours(black_comp, black_contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
+        findContours(white_comp, white_contours, RETR_LIST, CHAIN_APPROX_SIMPLE);

        if(black_contours.size() < 5 || white_contours.size() < 5) continue;

--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@ -1408,7 +1408,7 @@ bool CV_StereoCalibrationTest::checkPandROI( int test_case_idx, const Mat& M, co
        for( x = 0; x < N; x++ )
            pts.push_back(Point2f((float)x*imgsize.width/(N-1), (float)y*imgsize.height/(N-1)));

-    undistortPoints(Mat(pts), upts, M, D, R, P );
+    undistortPoints(pts, upts, M, D, R, P );
    for( k = 0; k < N*N; k++ )
        if( upts[k].x < -imgsize.width*eps || upts[k].x > imgsize.width*(1+eps) ||
            upts[k].y < -imgsize.height*eps || upts[k].y > imgsize.height*(1+eps) )
@ -1717,8 +1717,8 @@ void CV_StereoCalibrationTest::run( int )
        for( int i = 0, k = 0; i < nframes; i++ )
        {
            vector<Point2f> temp[2];
-            undistortPoints(Mat(imgpt1[i]), temp[0], M1, D1, R1, P1);
-            undistortPoints(Mat(imgpt2[i]), temp[1], M2, D2, R2, P2);
+            undistortPoints(imgpt1[i], temp[0], M1, D1, R1, P1);
+            undistortPoints(imgpt2[i], temp[1], M2, D2, R2, P2);

            for( int j = 0; j < npoints; j++, k++ )
            {
--- a/modules/calib3d/test/test_cameracalibration_artificial.cpp
+++ b/modules/calib3d/test/test_cameracalibration_artificial.cpp
@ -353,7 +353,7 @@ protected:
        rvecs_spnp.resize(brdsNum);
        tvecs_spnp.resize(brdsNum);
        for(size_t i = 0; i < brdsNum; ++i)
-            solvePnP(Mat(objectPoints[i]), Mat(imagePoints[i]), camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]);
+            solvePnP(objectPoints[i], imagePoints[i], camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]);

        compareShiftVecs(tvecs_exp, tvecs_spnp);
        compareRotationVecs(rvecs_exp, rvecs_spnp);
--- a/modules/calib3d/test/test_chessboardgenerator.cpp
+++ b/modules/calib3d/test/test_chessboardgenerator.cpp
@ -126,10 +126,10 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co
                generateEdge(p3, p4, pts_square3d);
                generateEdge(p4, p1, pts_square3d);

-                projectPoints(Mat(pts_square3d), rvec, tvec, camMat, distCoeffs, pts_square2d);
+                projectPoints(pts_square3d, rvec, tvec, camMat, distCoeffs, pts_square2d);
                squares_black.resize(squares_black.size() + 1);
                vector<Point2f> temp;
-                approxPolyDP(Mat(pts_square2d), temp, 1.0, true);
+                approxPolyDP(pts_square2d, temp, 1.0, true);
                transform(temp.begin(), temp.end(), back_inserter(squares_black.back()), Mult(rendererResolutionMultiplier));
            }

@ -139,7 +139,7 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co
        for(int i = 0; i < patternSize.width - 1; ++i)
            corners3d.push_back(zero + (i + 1) * sqWidth * pb1 + (j + 1) * sqHeight * pb2);
    corners.clear();
-    projectPoints(Mat(corners3d), rvec, tvec, camMat, distCoeffs, corners);
+    projectPoints(corners3d, rvec, tvec, camMat, distCoeffs, corners);

    vector<Point3f> whole3d;
    vector<Point2f> whole2d;
@ -147,9 +147,9 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co
    generateEdge(whole[1], whole[2], whole3d);
    generateEdge(whole[2], whole[3], whole3d);
    generateEdge(whole[3], whole[0], whole3d);
-    projectPoints(Mat(whole3d), rvec, tvec, camMat, distCoeffs, whole2d);
+    projectPoints(whole3d, rvec, tvec, camMat, distCoeffs, whole2d);
    vector<Point2f> temp_whole2d;
-    approxPolyDP(Mat(whole2d), temp_whole2d, 1.0, true);
+    approxPolyDP(whole2d, temp_whole2d, 1.0, true);

    vector< vector<Point > > whole_contour(1);
    transform(temp_whole2d.begin(), temp_whole2d.end(),
@ -213,7 +213,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat
        pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2;

        /* can remake with better perf */
-        projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d);
+        projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d);

        bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0;
        bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0;
@ -278,7 +278,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat
        pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2;

        /* can remake with better perf */
-        projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d);
+        projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d);

        bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0;
        bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0;
@ -320,7 +320,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat
    pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2;

    /* can remake with better perf */
-    projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d);
+    projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d);

    Point3f zero = p - pb1 * cbHalfWidth - cbHalfHeight * pb2;

--- a/modules/calib3d/test/test_homography_decomp.cpp
+++ b/modules/calib3d/test/test_homography_decomp.cpp
@ -134,4 +134,36 @@ private:

 TEST(Calib3d_DecomposeHomography, regression) { CV_HomographyDecompTest test; test.safe_run(); }

+
+TEST(Calib3d_DecomposeHomography, issue_4978)
+{
+    // Regression test: every rotation returned by decomposeHomographyMat()
+    // for this homography must have a determinant close to +1.
+    Matx33d K(  // identity matrix passed as the camera matrix
+        1.0,   0.0,    0.0,
+        0.0,   1.0,    0.0,
+        0.0,   0.0,    1.0
+    );
+
+    Matx33d H(
+        -0.102896, 0.270191,   -0.0031153,
+        0.0406387, 1.19569,    -0.0120456,
+        0.445351,  0.0410889,  1
+    );
+
+    vector<Mat> rotations, translations, normals;
+    decomposeHomographyMat(H, K, rotations, translations, normals);
+    ASSERT_GT(rotations.size(), (size_t)0u);
+    // translations[i] is indexed below, so both outputs must have the same length.
+    ASSERT_EQ(rotations.size(), translations.size());
+    for (size_t i = 0; i < rotations.size(); i++)
+    {
+        // check: det(R) = 1; report the determinant of solution i (not solution 0)
+        const double det = cv::determinant(rotations[i]);
+        EXPECT_TRUE(std::fabs(det - 1.0) < 0.01)
+            << "R: det=" << det << std::endl << rotations[i] << std::endl
+            << "T:" << std::endl << translations[i] << std::endl;
+    }
+}
+
+
 }} // namespace
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@ -124,7 +124,7 @@ protected:

        vector<Point2f> projectedPoints;
        projectedPoints.resize(points.size());
-        projectPoints(Mat(points), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
+        projectPoints(points, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
        for (size_t i = 0; i < projectedPoints.size(); i++)
        {
            if (i % 20 == 0)
@ -241,7 +241,7 @@ protected:

        vector<Point2f> projectedPoints;
        projectedPoints.resize(opoints.size());
-        projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
+        projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);

        bool isEstimateSuccess = solvePnP(opoints, projectedPoints, intrinsics, distCoeffs, rvec, tvec, false, method);
        if (isEstimateSuccess == false)
@ -291,7 +291,7 @@ class CV_solveP3P_Test : public CV_solvePnPRansac_Test

    vector<Point2f> projectedPoints;
    projectedPoints.resize(opoints.size());
-    projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
+    projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);

    int num_of_solutions = solveP3P(opoints, projectedPoints, intrinsics, distCoeffs, rvecs, tvecs, method);
    if (num_of_solutions != (int) rvecs.size() || num_of_solutions != (int) tvecs.size() || num_of_solutions == 0)
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -186,6 +186,16 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 #  endif
 #endif

+#ifndef CV_ALWAYS_INLINE  // keep any definition that was supplied before this point
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))  // __GNUC__ version 3.1 or newer
+#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)  // compilers that define _MSC_VER
+#define CV_ALWAYS_INLINE __forceinline
+#else  // any other compiler: fall back to plain inline
+#define CV_ALWAYS_INLINE inline
+#endif
+#endif  // CV_ALWAYS_INLINE
+
 #if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED)
 #  define CV_ENABLE_UNROLLED 0
 #else
--- a/modules/dnn/include/opencv2/dnn/version.hpp
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@ -6,7 +6,7 @@
 #define OPENCV_DNN_VERSION_HPP

 /// Use with major OpenCV version only.
-#define OPENCV_DNN_API_VERSION 20181221
+#define OPENCV_DNN_API_VERSION 20190122

 #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS
 #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@ -157,8 +157,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)

 PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)))
+    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
    processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "",
               Mat(cv::Size(224, 224), CV_32FC3));
@ -211,8 +210,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)

 PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
    Mat sample = imread(findDataFile("dnn/dog416.png", false));
    Mat inp;
@ -222,8 +220,11 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)

 PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+    if (backend == DNN_BACKEND_HALIDE
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE < 2018030000
+        || (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+#endif
+    )
        throw SkipTestException("");
    processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
 }
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -707,12 +707,6 @@ struct DataLayer : public Layer
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
-        InferenceEngine::LayerParams lp;
-        lp.name = name;
-        lp.type = "ScaleShift";
-        lp.precision = InferenceEngine::Precision::FP32;
-        std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));
-
        CV_CheckEQ(inputsData.size(), (size_t)1, "");
        CV_CheckEQ(inputsData[0].dims, 4, "");
        const size_t numChannels = inputsData[0].size[1];
@ -723,7 +717,6 @@ struct DataLayer : public Layer
                                                                {numChannels});
        weights->allocate();
        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
-        ieLayer->_weights = weights;

        // Mean subtraction
        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
@ -735,8 +728,21 @@ struct DataLayer : public Layer
            biasesVec[i] = -means[0][i] * scaleFactors[0];
        }
        biases->set(biasesVec);
-        ieLayer->_biases = biases;

+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ScaleShiftLayer ieLayer(name);
+        ieLayer.setWeights(weights);
+        ieLayer.setBiases(biases);
+#else
+        InferenceEngine::LayerParams lp;
+        lp.name = name;
+        lp.type = "ScaleShift";
+        lp.precision = InferenceEngine::Precision::FP32;
+        std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));
+
+        ieLayer->_weights = weights;
+        ieLayer->_biases = biases;
+#endif
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
@ -1480,7 +1486,11 @@ struct Net::Impl
                if (layerNet != ieInpNode->net)
                {
                    // layerNet is empty or nodes are from different graphs.
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+                    ieInpNode->net->addOutput(ieInpNode->layer.getName());
+#else
                    ieInpNode->net->addOutput(ieInpNode->layer->name);
+#endif
                }
            }
        }
@ -1590,7 +1600,7 @@ struct Net::Impl

        // Build Inference Engine networks from sets of layers that support this
        // backend. Split a whole model on several Inference Engine networks if
-        // some of layers is not implemented.
+        // some of the layers are not implemented.

        // Set of all input and output blobs wrappers for current network.
        std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
@ -1606,7 +1616,7 @@ struct Net::Impl
            {
                addInfEngineNetOutputs(ld);
                net = Ptr<InfEngineBackendNet>();
-                netBlobsWrappers.clear();
+                netBlobsWrappers.clear();  // Is not used for R5 release but we don't wrap it to #ifdef.
                layer->preferableTarget = DNN_TARGET_CPU;
                continue;
            }
@ -1624,12 +1634,13 @@ struct Net::Impl
                    if (ieInpNode->net != net)
                    {
                        net = Ptr<InfEngineBackendNet>();
-                        netBlobsWrappers.clear();
+                        netBlobsWrappers.clear();  // Is not used for R5 release but we don't wrap it to #ifdef.
                        break;
                    }
                }
            }

+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5)
            // The same blobs wrappers cannot be shared between two Inference Engine
            // networks because of explicit references between layers and blobs.
            // So we need to rewrap all the external blobs.
@ -1646,6 +1657,7 @@ struct Net::Impl
                    ld.inputBlobsWrappers[i] = it->second;
            }
            netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0];
+#endif  // IE < R5

            Ptr<BackendNode> node;
            if (!net.empty())
@ -1676,6 +1688,40 @@ struct Net::Impl
            CV_Assert(!ieNode.empty());
            ieNode->net = net;

+            // Convert weights to FP16 for specific targets.
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+            if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
+                 preferableTarget == DNN_TARGET_MYRIAD ||
+                 preferableTarget == DNN_TARGET_FPGA) && !fused)
+            {
+                auto& blobs = ieNode->layer.getConstantData();
+                if (blobs.empty())
+                {
+                    // In case of a non-weightable layer we have to specify
+                    // its precision by adding a dummy blob.
+                    auto blob = InferenceEngine::make_shared_blob<int16_t>(
+                                    InferenceEngine::Precision::FP16,
+                                    InferenceEngine::Layout::C, {1});
+                    blob->allocate();
+                    blobs[""] = blob;
+                }
+                else
+                {
+                    for (auto& it : blobs)
+                        it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
+                }
+            }
+
+            if (!fused)
+                net->addLayer(ieNode->layer);
+
+            net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
+            net->addBlobs(ld.inputBlobsWrappers);
+            net->addBlobs(ld.outputBlobsWrappers);
+            addInfEngineNetOutputs(ld);
+
+#else  // IE >= R5
+
            auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
            if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
                 preferableTarget == DNN_TARGET_MYRIAD ||
@ -1713,10 +1759,10 @@ struct Net::Impl
            if (!fused)
                net->addLayer(ieNode->layer);
            addInfEngineNetOutputs(ld);
+#endif  // IE >= R5
        }

        // Initialize all networks.
-        std::set<InfEngineBackendNet> initializedNets;
        for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
        {
            LayerData &ld = it->second;
@ -2622,7 +2668,11 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
    Net cvNet;
    cvNet.setInputsNames(inputsNames);

+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
+#else
    Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(0));
+#endif
    backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
    for (auto& it : ieNet.getOutputsInfo())
    {
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -349,6 +349,14 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ScaleShiftLayer ieLayer(name);
+
+        const size_t numChannels = weights_.total();
+        ieLayer.setWeights(wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C));
+        ieLayer.setBiases(wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "ScaleShift";
@ -360,6 +368,7 @@ public:
        ieLayer->_biases = wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@ -110,6 +110,11 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::SplitLayer ieLayer(name);
+        ieLayer.setOutputPorts({InferenceEngine::Port()});
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        CV_Assert(!input->dims.empty());

@ -123,6 +128,7 @@ public:
        ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]);
 #endif
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@ -313,6 +313,14 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+
+        InferenceEngine::Builder::ConcatLayer ieLayer(name);
+        ieLayer.setAxis(clamp(axis, input->dims.size()));
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        InferenceEngine::LayerParams lp;
        lp.name = name;
@ -321,6 +329,7 @@ public:
        std::shared_ptr<InferenceEngine::ConcatLayer> ieLayer(new InferenceEngine::ConcatLayer(lp));
        ieLayer->_axis = clamp(axis, input->dims.size());
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -521,6 +521,54 @@ public:
        const int inpGroupCn = blobs[0].size[1];
        const int group = inpCn / inpGroupCn;

+        auto ieWeights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW);
+        if (newWeightAndBias)
+        {
+            if (weightsMat.isContinuous())
+            {
+                Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size);
+                ieWeights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW);
+            }
+            else
+            {
+                ieWeights = InferenceEngine::make_shared_blob<float>(
+                                    InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW,
+                                    ieWeights->dims());
+                ieWeights->allocate();
+
+                Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn);
+                Mat fusedWeights = weightsMat.colRange(0, newWeights.cols);
+                fusedWeights.copyTo(newWeights);
+            }
+        }
+        InferenceEngine::Blob::Ptr ieBiases;
+        if (hasBias() || fusedBias)
+        {
+            Mat biasesMat({outCn}, CV_32F, &biasvec[0]);
+            ieBiases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C);
+        }
+
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ConvolutionLayer ieLayer(name);
+
+        ieLayer.setKernel({kernel.height, kernel.width});
+        ieLayer.setStrides({stride.height, stride.width});
+        ieLayer.setDilation({dilation.height, dilation.width});
+        ieLayer.setPaddingsBegin({pad.height, pad.width});
+        ieLayer.setPaddingsEnd({pad.height, pad.width});
+        ieLayer.setGroup(group);
+        ieLayer.setOutDepth(outCn);
+
+        ieLayer.setWeights(ieWeights);
+        if (ieBiases)
+            ieLayer.setBiases(ieBiases);
+
+        InferenceEngine::Builder::Layer l = ieLayer;
+        if (!padMode.empty())
+            l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper");
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(l));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Convolution";
@ -557,32 +605,11 @@ public:
        ieLayer->_out_depth = outCn;
        ieLayer->_group = group;

-        ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW);
-        if (newWeightAndBias)
-        {
-            if (weightsMat.isContinuous())
-            {
-                Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size);
-                ieLayer->_weights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW);
-            }
-            else
-            {
-                ieLayer->_weights = InferenceEngine::make_shared_blob<float>(
-                                    InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW,
-                                    ieLayer->_weights->dims());
-                ieLayer->_weights->allocate();
-
-                Mat newWeights = infEngineBlobToMat(ieLayer->_weights).reshape(1, outCn);
-                Mat fusedWeights = weightsMat.colRange(0, newWeights.cols);
-                fusedWeights.copyTo(newWeights);
-            }
-        }
-        if (hasBias() || fusedBias)
-        {
-            Mat biasesMat({outCn}, CV_32F, &biasvec[0]);
-            ieLayer->_biases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C);
-        }
+        ieLayer->_weights = ieWeights;
+        if (ieBiases)
+            ieLayer->_biases = ieBiases;
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
@ -1193,6 +1220,9 @@ public:
 #ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        {
+            if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width))
+                return false;
+
            const int outGroupCn = blobs[0].size[1];  // Weights are in IOHW layout
            const int group = numOutput / outGroupCn;
            if (group != 1)
@ -1747,6 +1777,27 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        const int outGroupCn = blobs[0].size[1];  // Weights are in IOHW layout
+        const int group = numOutput / outGroupCn;
+
+        InferenceEngine::Builder::DeconvolutionLayer ieLayer(name);
+
+        ieLayer.setKernel({kernel.height, kernel.width});
+        ieLayer.setStrides({stride.height, stride.width});
+        ieLayer.setDilation({dilation.height, dilation.width});
+        ieLayer.setPaddingsBegin({pad.height, pad.width});
+        ieLayer.setPaddingsEnd({pad.height, pad.width});
+        ieLayer.setGroup(group);
+        ieLayer.setOutDepth(numOutput);
+
+        ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW));
+        if (hasBias())
+        {
+            ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C));
+        }
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        const int outGroupCn = blobs[0].size[1];  // Weights are in IOHW layout
        const int group = numOutput / outGroupCn;

@ -1786,6 +1837,7 @@ public:
            ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C);
        }
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/crop_layer.cpp
+++ b/modules/dnn/src/layers/crop_layer.cpp
@ -67,8 +67,12 @@ public:

    // Reports whether this crop layer can run on the requested backend.
    // With Inference Engine compiled in, that backend is accepted only when the
    // version check below passes (presumably: IE release older than 2018R5) and
    // exactly four crop ranges are configured. Every other case falls through to
    // the plain OpenCV-backend check.
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
-        return backendId == DNN_BACKEND_OPENCV ||
-               (backendId == DNN_BACKEND_INFERENCE_ENGINE && crop_ranges.size() == 4);
+#ifdef HAVE_INF_ENGINE
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+            return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) && crop_ranges.size() == 4;  // initInfEngine yields an empty node when this version check fails
+        else  // intentionally dangling: pairs with the return that follows the #endif
+#endif
+            return backendId == DNN_BACKEND_OPENCV;  // the only statement left when HAVE_INF_ENGINE is undefined
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -145,9 +149,10 @@ public:
        input(&crop_ranges[0]).copyTo(outputs[0]);
    }

+#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
-#ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5)
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Crop";
@ -181,9 +186,11 @@ public:
        ieLayer->dim.push_back(crop_ranges[3].end - crop_ranges[3].start);
 #endif
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
-#endif  // HAVE_INF_ENGINE
+#else
        return Ptr<BackendNode>();
+#endif  // IE < R5
    }
+#endif

    std::vector<Range> crop_ranges;
 };
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@ -939,6 +939,25 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::DetectionOutputLayer ieLayer(name);
+
+        ieLayer.setNumClasses(_numClasses);
+        ieLayer.setShareLocation(_shareLocation);
+        ieLayer.setBackgroudLabelId(_backgroundLabelId);
+        ieLayer.setNMSThreshold(_nmsThreshold);
+        ieLayer.setTopK(_topK);
+        ieLayer.setKeepTopK(_keepTopK);
+        ieLayer.setConfidenceThreshold(_confidenceThreshold);
+        ieLayer.setVariantEncodedInTarget(_varianceEncodedInTarget);
+        ieLayer.setCodeType("caffe.PriorBoxParameter." + _codeType);
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(3));
+
+        InferenceEngine::Builder::Layer l = ieLayer;
+        l.getParameters()["eta"] = std::string("1.0");
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(l));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "DetectionOutput";
@ -956,6 +975,7 @@ public:
        ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";
        ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType;
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -153,10 +153,16 @@ public:
    // Creates the Inference Engine backend node for this element-wise layer.
    // The activation-specific description comes from the functor `func`; this
    // wrapper only supplies the layer name (and FP32 precision on the legacy path).
    // Without HAVE_INF_ENGINE an empty node is returned.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::Layer ieLayer = func.initInfEngineBuilderAPI();  // functor describes the layer via the Builder API
+        ieLayer.setName(this->name);  // the name is assigned here, after the functor built the layer
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = this->name;
        lp.precision = InferenceEngine::Precision::FP32;
        return Ptr<BackendNode>(new InfEngineBackendNode(func.initInfEngine(lp)));  // functor fills in the type and any parameters
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();  // reached only when HAVE_INF_ENGINE is undefined
    }
@ -355,6 +361,12 @@ struct ReLUFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a ReLU layer created with an
+    // empty name whose negative slope is this functor's `slope`.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::ReLULayer reluLayer("");
+        reluLayer.setNegativeSlope(slope);
+        return reluLayer;
+    }
+#else
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "ReLU";
@ -363,6 +375,7 @@ struct ReLUFunctor
        ieLayer->params["negative_slope"] = format("%f", slope);
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -472,6 +485,12 @@ struct ReLU6Functor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a Clamp layer created with an
+    // empty name, bounded by this functor's `minValue` and `maxValue`.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::ClampLayer clampLayer("");
+        clampLayer.setMinValue(minValue);
+        clampLayer.setMaxValue(maxValue);
+        return clampLayer;
+    }
+#else
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "Clamp";
@ -482,6 +501,7 @@ struct ReLU6Functor
        ieLayer->params["max"] = format("%f", maxValue);
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -558,12 +578,19 @@ struct TanHFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a TanH layer created with an
+    // empty name and no further parameters.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::TanHLayer tanhLayer("");
+        return tanhLayer;
+    }
+#else
    // Legacy (non-Builder) Inference Engine path: marks the supplied layer
    // parameters as type "TanH" and returns a generic CNNLayer built from them.
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "TanH";  // lp is taken by reference, so the caller's params are modified
        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -640,12 +667,19 @@ struct SigmoidFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a Sigmoid layer created with
+    // an empty name and no further parameters.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::SigmoidLayer sigmoidLayer("");
+        return sigmoidLayer;
+    }
+#else
    // Legacy (non-Builder) Inference Engine path: marks the supplied layer
    // parameters as type "Sigmoid" and returns a generic CNNLayer built from them.
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "Sigmoid";  // lp is taken by reference, so the caller's params are modified
        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -724,11 +758,18 @@ struct ELUFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: an ELU layer created with an
+    // empty name; no parameters are set on it here.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::ELULayer eluLayer("");
+        return eluLayer;
+    }
+#else
    // Legacy (non-Builder) Inference Engine path: marks the supplied layer
    // parameters as type "ELU" and returns a generic CNNLayer built from them.
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "ELU";  // lp is taken by reference, so the caller's params are modified
        return InferenceEngine::CNNLayerPtr(new InferenceEngine::CNNLayer(lp));
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -805,6 +846,12 @@ struct AbsValFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a ReLU layer created with an
+    // empty name and a negative slope of -1.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::ReLULayer absLayer("");
+        absLayer.setNegativeSlope(-1);
+        return absLayer;
+    }
+#else
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "ReLU";
@ -813,6 +860,7 @@ struct AbsValFunctor
        ieLayer->params["negative_slope"] = "-1.0";
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -868,11 +916,18 @@ struct BNLLFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API path for BNLL: no Inference Engine layer is produced here;
+    // the call always raises StsNotImplemented. The message names the layer so
+    // the failure is identifiable, as in the legacy initInfEngine path.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        CV_Error(Error::StsNotImplemented, "BNLL");
+    }
+#else
    // Legacy (non-Builder) Inference Engine path for BNLL: no layer is produced;
    // the call reports StsNotImplemented with the message "BNLL".
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        CV_Error(Error::StsNotImplemented, "BNLL");
        return InferenceEngine::CNNLayerPtr();  // presumably unreachable if CV_Error never returns; gives every path a return value
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -985,6 +1040,14 @@ struct PowerFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a Power layer created with an
+    // empty name, configured from this functor's `power`, `scale` and `shift`.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        InferenceEngine::Builder::PowerLayer powerLayer("");
+        powerLayer.setPower(power);
+        powerLayer.setScale(scale);
+        powerLayer.setShift(shift);
+        return powerLayer;
+    }
+#else
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        if (power == 1.0f && scale == 1.0f && shift == 0.0f)
@ -1004,6 +1067,7 @@ struct PowerFunctor
            return ieLayer;
        }
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
@ -1143,6 +1207,15 @@ struct ChannelsPReLUFunctor
 #endif  // HAVE_HALIDE

 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    // Builder-API description of this activation: a PReLU layer created with an
+    // empty name whose weights wrap the `scale` blob as a 1-D (Layout::C) blob
+    // with one entry per element of `scale`.
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        const size_t channelCount = scale.total();
+        InferenceEngine::Blob::Ptr slopes = wrapToInfEngineBlob(scale, {channelCount}, InferenceEngine::Layout::C);
+
+        InferenceEngine::Builder::PReLULayer preluLayer("");
+        preluLayer.setWeights(slopes);
+        return preluLayer;
+    }
+#else
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
        lp.type = "PReLU";
@ -1151,6 +1224,7 @@ struct ChannelsPReLUFunctor
        ieLayer->_weights = wrapToInfEngineBlob(scale, {numChannels}, InferenceEngine::Layout::C);
        return ieLayer;
    }
+#endif
 #endif  // HAVE_INF_ENGINE

 #ifdef HAVE_VULKAN
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@ -99,7 +99,7 @@ public:
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_HALIDE ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE &&
-                (preferableTarget != DNN_TARGET_MYRIAD || coeffs.empty()));
+                (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()));
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -420,9 +420,29 @@ public:
        return Ptr<BackendNode>();
    }

-    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::EltwiseLayer ieLayer(name);
+
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
+
+        if (op == SUM)
+            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
+        else if (op == PROD)
+            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
+        else if (op == MAX)
+            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
+        else
+            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
+
+        InferenceEngine::Builder::Layer l = ieLayer;
+        if (!coeffs.empty())
+            l.getParameters()["coeff"] = coeffs;
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(l));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Eltwise";
@ -438,6 +458,7 @@ public:
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@ -152,9 +152,19 @@ public:
        }
    }

-    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::Layer ieLayer(name);
+        ieLayer.setName(name);
+        ieLayer.setType("Flatten");
+        ieLayer.getParameters()["axis"] = _startAxis;
+        ieLayer.getParameters()["end_axis"] = _endAxis;
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
+        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Flatten";
@ -163,6 +173,7 @@ public:
        ieLayer->params["axis"] = format("%d", _startAxis);
        ieLayer->params["end_axis"] = format("%d", _endAxis);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -442,6 +442,18 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::FullyConnectedLayer ieLayer(name);
+
+        const int outNum = blobs[0].size[0];
+        ieLayer.setOutputNum(outNum);
+
+        ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW));
+        if (blobs.size() > 1)
+            ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C));
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "FullyConnected";
@ -456,6 +468,7 @@ public:
        if (blobs.size() > 1)
            ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)ieLayer->_out_num}, InferenceEngine::Layout::C);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@ -393,6 +393,17 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::NormLayer ieLayer(name);
+        ieLayer.setSize(size);
+        ieLayer.setAlpha(alpha);
+        ieLayer.setBeta(beta);
+        ieLayer.setAcrossMaps(type == CHANNEL_NRM);
+
+        InferenceEngine::Builder::Layer l = ieLayer;
+        l.getParameters()["k"] = bias;
+        return Ptr<BackendNode>(new InfEngineBackendNode(l));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Norm";
@ -405,6 +416,7 @@ public:
        ieLayer->_alpha = alpha;
        ieLayer->_isAcrossMaps = (type == CHANNEL_NRM);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -371,6 +371,13 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::MVNLayer ieLayer(name);
+        ieLayer.setAcrossChannels(acrossChannels);
+        ieLayer.setNormalize(normVariance);
+        ieLayer.setEpsilon(eps);
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "MVN";
@ -380,6 +387,7 @@ public:
        ieLayer->params["normalize_variance"] = normVariance ? "1" : "0";
        ieLayer->params["eps"] = format("%f", eps);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@ -264,6 +264,49 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+        if (input->dims.size() == 4)
+        {
+            InferenceEngine::Builder::NormalizeLayer ieLayer(name);
+
+            ieLayer.setChannelShared(false);
+            ieLayer.setAcrossMaps(acrossSpatial);
+            ieLayer.setEpsilon(epsilon);
+
+            InferenceEngine::Builder::Layer l = ieLayer;
+            const int numChannels = input->dims[2];  // NOTE: input->dims are reversed (whcn)
+            if (blobs.empty())
+            {
+                auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
+                                                                        InferenceEngine::Layout::C,
+                                                                        {(size_t)numChannels});
+                weights->allocate();
+                std::vector<float> ones(numChannels, 1);
+                weights->set(ones);
+                l.addConstantData("weights", weights);
+                l.getParameters()["channel_shared"] = false;
+            }
+            else
+            {
+                CV_Assert(numChannels == blobs[0].total());
+                l.addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)numChannels}, InferenceEngine::Layout::C));
+                l.getParameters()["channel_shared"] = blobs[0].total() == 1;
+            }
+            l.getParameters()["across_spatial"] = acrossSpatial;
+            return Ptr<BackendNode>(new InfEngineBackendNode(l));
+        }
+        else
+        {
+            InferenceEngine::Builder::GRNLayer ieLayer(name);
+            ieLayer.setBeta(epsilon);
+
+            InferenceEngine::Builder::Layer l = ieLayer;
+            l.getParameters()["bias"] = epsilon;
+
+            return Ptr<BackendNode>(new InfEngineBackendNode(l));
+        }
+#else
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

        InferenceEngine::LayerParams lp;
@ -307,6 +350,7 @@ public:
            ieLayer->params["bias"] = format("%f", epsilon);
            return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
        }
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@ -385,6 +385,11 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::PermuteLayer ieLayer(name);
+        ieLayer.setOrder(_order);
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Permute";
@ -397,6 +402,7 @@ public:
            ieLayer->params["order"] += format(",%zu", _order[i]);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -295,6 +295,48 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        if (type == MAX || type == AVE)
+        {
+            InferenceEngine::Builder::PoolingLayer ieLayer(name);
+            ieLayer.setKernel({kernel.height, kernel.width});
+            ieLayer.setStrides({stride.height, stride.width});
+            ieLayer.setPaddingsBegin({pad_t, pad_l});
+            ieLayer.setPaddingsEnd({pad_b, pad_r});
+            ieLayer.setPoolingType(type == MAX ?
+                                   InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
+                                   InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);
+            ieLayer.setRoundingType(ceilMode ?
+                                    InferenceEngine::Builder::PoolingLayer::RoundingType::CEIL :
+                                    InferenceEngine::Builder::PoolingLayer::RoundingType::FLOOR);
+            ieLayer.setExcludePad(type == AVE && padMode == "SAME");
+
+            InferenceEngine::Builder::Layer l = ieLayer;
+            if (!padMode.empty())
+                l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper");
+            return Ptr<BackendNode>(new InfEngineBackendNode(l));
+        }
+        else if (type == ROI)
+        {
+            InferenceEngine::Builder::ROIPoolingLayer ieLayer(name);
+            ieLayer.setSpatialScale(spatialScale);
+            ieLayer.setPooled({pooledSize.height, pooledSize.width});
+            ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
+            return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+        }
+        else if (type == PSROI)
+        {
+            InferenceEngine::Builder::PSROIPoolingLayer ieLayer(name);
+            ieLayer.setSpatialScale(spatialScale);
+            ieLayer.setOutputDim(psRoiOutChannels);
+            ieLayer.setGroupSize(pooledSize.width);
+            ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
+            return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
+        return Ptr<BackendNode>();
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.precision = InferenceEngine::Precision::FP32;
@ -353,6 +395,7 @@ public:
            CV_Error(Error::StsNotImplemented, "Unsupported pooling type");

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -498,6 +498,58 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        if (_explicitSizes)
+        {
+            InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name);
+
+            CV_Assert(_stepX == _stepY);
+            ieLayer.setStep(_stepX);
+
+            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
+            ieLayer.setOffset(_offsetsX[0]);
+
+            ieLayer.setClip(_clip);
+            ieLayer.setFlip(false);  // We already flipped aspect ratios.
+
+            InferenceEngine::Builder::Layer l = ieLayer;
+
+            CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
+            CV_Assert(_boxWidths.size() == _boxHeights.size());
+            l.getParameters()["width"] = _boxWidths;
+            l.getParameters()["height"] = _boxHeights;
+            l.getParameters()["variance"] = _variance;
+            return Ptr<BackendNode>(new InfEngineBackendNode(l));
+        }
+        else
+        {
+            InferenceEngine::Builder::PriorBoxLayer ieLayer(name);
+
+            CV_Assert(!_explicitSizes);
+
+            ieLayer.setMinSize(_minSize);
+            if (_maxSize > 0)
+                ieLayer.setMaxSize(_maxSize);
+
+            CV_Assert(_stepX == _stepY);
+            ieLayer.setStep(_stepX);
+
+            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
+            ieLayer.setOffset(_offsetsX[0]);
+
+            ieLayer.setClip(_clip);
+            ieLayer.setFlip(false);  // We already flipped aspect ratios.
+
+            InferenceEngine::Builder::Layer l = ieLayer;
+            if (!_aspectRatios.empty())
+            {
+                l.getParameters()["aspect_ratio"] = _aspectRatios;
+            }
+            CV_Assert(!_variance.empty());
+            l.getParameters()["variance"] = _variance;
+            return Ptr<BackendNode>(new InfEngineBackendNode(l));
+        }
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = _explicitSizes ? "PriorBoxClustered" : "PriorBox";
@ -553,6 +605,7 @@ public:
        ieLayer->params["offset"] = format("%f", _offsetsX[0]);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@ -328,6 +328,28 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ProposalLayer ieLayer(name);
+
+        ieLayer.setBaseSize(baseSize);
+        ieLayer.setFeatStride(featStride);
+        ieLayer.setMinSize(16);
+        ieLayer.setNMSThresh(nmsThreshold);
+        ieLayer.setPostNMSTopN(keepTopAfterNMS);
+        ieLayer.setPreNMSTopN(keepTopBeforeNMS);
+
+        std::vector<float> scalesVec(scales.size());
+        for (int i = 0; i < scales.size(); ++i)
+            scalesVec[i] = scales.get<float>(i);
+        ieLayer.setScale(scalesVec);
+
+        std::vector<float> ratiosVec(ratios.size());
+        for (int i = 0; i < ratios.size(); ++i)
+            ratiosVec[i] = ratios.get<float>(i);
+        ieLayer.setRatio(ratiosVec);
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Proposal";
@ -353,6 +375,7 @@ public:
                ieLayer->params["scale"] += format(",%f", scales.get<float>(i));
        }
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@ -181,6 +181,11 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ReorgYoloLayer ieLayer(name);
+        ieLayer.setStride(reorgStride);
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "ReorgYolo";
@ -188,6 +193,7 @@ public:
        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
        ieLayer->params["stride"] = format("%d", reorgStride);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@ -203,6 +203,17 @@ public:
        return true;
    }

+    // Records the shape of every output blob in `outShapes` so that later
+    // stages (forward_ocl and the Builder-API initInfEngine) can read the
+    // target shapes without inspecting the outputs again.
+    // The inputs argument is unused; at least one output is required.
+    void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        std::vector<Mat> outputs;
+        outputs_arr.getMatVector(outputs);
+
+        CV_Assert(!outputs.empty());
+        outShapes.resize(outputs.size());
+        // size_t index matches the unsigned type of outputs.size()
+        for (size_t i = 0; i < outputs.size(); ++i)
+            outShapes[i] = shape(outputs[i]);
+    }
+
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
@ -218,8 +229,7 @@ public:
            void *dst_handle = outputs[i].handle(ACCESS_WRITE);
            if (src_handle != dst_handle)
            {
-                MatShape outShape = shape(outputs[i]);
-                UMat umat = srcBlob.reshape(1, (int)outShape.size(), &outShape[0]);
+                UMat umat = srcBlob.reshape(1, (int)outShapes[i].size(), &outShapes[i][0]);
                umat.copyTo(outputs[i]);
            }
        }
@ -250,6 +260,12 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ReshapeLayer ieLayer(name);
+        CV_Assert(outShapes.size() == 1);
+        ieLayer.setDims(outShapes[0]);
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Reshape";
@ -265,9 +281,13 @@ public:
            ieLayer->shape = std::vector<int>(shapeSrc->dims.rbegin(), shapeSrc->dims.rend());
        }
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
+
+private:
+    std::vector<MatShape> outShapes;
 };

 Ptr<ReshapeLayer> ReshapeLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@ -163,6 +163,33 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::Layer ieLayer(name);
+        ieLayer.setName(name);
+        if (interpolation == "nearest")
+        {
+            ieLayer.setType("Resample");
+            ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST");
+            ieLayer.getParameters()["antialias"] = false;
+            if (scaleWidth != scaleHeight)
+                CV_Error(Error::StsNotImplemented, "resample with sw != sh");
+            ieLayer.getParameters()["factor"] = 1.0 / scaleWidth;
+        }
+        else if (interpolation == "bilinear")
+        {
+            ieLayer.setType("Interp");
+            ieLayer.getParameters()["pad_beg"] = 0;
+            ieLayer.getParameters()["pad_end"] = 0;
+            ieLayer.getParameters()["align_corners"] = false;
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
+        ieLayer.getParameters()["width"] = outWidth;
+        ieLayer.getParameters()["height"] = outHeight;
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
+        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.precision = InferenceEngine::Precision::FP32;
@ -187,6 +214,7 @@ public:
        ieLayer->params["width"] = cv::format("%d", outWidth);
        ieLayer->params["height"] = cv::format("%d", outHeight);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
@ -247,6 +275,18 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::Layer ieLayer(name);
+        ieLayer.setName(name);
+        ieLayer.setType("Interp");
+        ieLayer.getParameters()["pad_beg"] = 0;
+        ieLayer.getParameters()["pad_end"] = 0;
+        ieLayer.getParameters()["width"] = outWidth;
+        ieLayer.getParameters()["height"] = outHeight;
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
+        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Interp";
@ -256,6 +296,7 @@ public:
        ieLayer->params["pad_beg"] = "0";
        ieLayer->params["pad_end"] = "0";
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@ -197,6 +197,29 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::ScaleShiftLayer ieLayer(name);
+
+        CV_Assert(!blobs.empty());
+        const size_t numChannels = blobs[0].total();
+        if (hasWeights)
+        {
+            ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C));
+        }
+        else
+        {
+            auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
+                                                                    {numChannels});
+            weights->allocate();
+
+            std::vector<float> ones(numChannels, 1);
+            weights->set(ones);
+            ieLayer.setWeights(weights);
+        }
+        if (hasBias)
+            ieLayer.setBiases(wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "ScaleShift";
@ -223,6 +246,7 @@ public:
            ieLayer->_biases = wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -110,8 +110,15 @@ public:

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
-        return backendId == DNN_BACKEND_OPENCV ||
-               (backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1 && sliceRanges[0].size() == 4);
+#ifdef HAVE_INF_ENGINE
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+        {
+            return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) &&
+                   sliceRanges.size() == 1 && sliceRanges[0].size() == 4;
+        }
+        else
+#endif
+            return backendId == DNN_BACKEND_OPENCV;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -254,9 +261,10 @@ public:
        }
    }

+#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
-#ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5)
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        InferenceEngine::LayerParams lp;
        lp.name = name;
@ -286,10 +294,11 @@ public:
            ieLayer->dim.push_back(sliceRanges[0][i].end - sliceRanges[0][i].start);
        }
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
-
-#endif  // HAVE_INF_ENGINE
+#else
        return Ptr<BackendNode>();
+#endif  // IE < R5
    }
+#endif
 };

 Ptr<SliceLayer> SliceLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@ -326,6 +326,13 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+
+        InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
+        ieLayer.setAxis(clamp(axisRaw, input->dims.size()));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

        InferenceEngine::LayerParams lp;
@ -335,6 +342,7 @@ public:
        std::shared_ptr<InferenceEngine::SoftMaxLayer> ieLayer(new InferenceEngine::SoftMaxLayer(lp));
        ieLayer->axis = clamp(axisRaw, input->dims.size());
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@ -18,6 +18,10 @@ namespace cv { namespace dnn {

 #ifdef HAVE_INF_ENGINE

+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+// Wraps a builder-API layer description as a node of the Inference Engine backend.
+InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::Builder::Layer& _layer)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE)
+    , layer(_layer)
+{
+}
+#else
 InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& _layer)
    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {}

@ -40,6 +44,7 @@ void InfEngineBackendNode::connect(std::vector<Ptr<BackendWrapper> >& inputs,
    layer->outData[0] = dataPtr;
    dataPtr->creatorLayer = InferenceEngine::CNNLayerWeakPtr(layer);
 }
+#endif

 static std::vector<Ptr<InfEngineBackendWrapper> >
 infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
@ -54,6 +59,129 @@ infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
    return wrappers;
 }

+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+
+// Creates an empty backend network that is assembled through the builder API.
+// It does not wrap an existing CNNNetwork (hasNetOwner == false) and targets
+// the CPU until init() selects a device.
+InfEngineBackendNet::InfEngineBackendNet()
+    : netBuilder(""),
+      targetDevice(InferenceEngine::TargetDevice::eCPU),
+      hasNetOwner(false)
+{
+}
+
+// Wraps an already existing network: `net` is copied into `cnn` and
+// hasNetOwner is set, so init() will not rebuild the network from netBuilder.
+// The target defaults to the CPU until init() selects a device.
+InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net)
+    : netBuilder(""),
+      targetDevice(InferenceEngine::TargetDevice::eCPU),
+      cnn(net),
+      hasNetOwner(true)
+{
+}
+
+// Wires the layer called layerName to the producers of its inputs.
+//
+// For each input wrapper the producer is looked up by the name stored in the
+// wrapper's data node. A name that is not registered in `layers` yet is treated
+// as a network input: an InputLayer whose port has the blob's dimensions in
+// reversed order is added to the builder and registered under that name.
+// Every producer used here is removed from unconnectedLayersIds, and input
+// number i is attached to port i of the layer. Finally the data node of
+// outputs[0] is renamed to layerName.
+//
+// layerName must already be present in `layers` (see addLayer()) and outputs
+// must not be empty; both conditions are enforced with CV_Assert.
+void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                  const std::vector<Ptr<BackendWrapper> >& outputs,
+                                  const std::string& layerName)
+{
+    std::vector<Ptr<InfEngineBackendWrapper> > inpWrappers = infEngineWrappers(inputs);
+    std::map<std::string, int>::iterator it = layers.find(layerName);
+    CV_Assert(it != layers.end());
+
+    const int layerId = it->second;
+    // size_t index: avoids a signed/unsigned comparison against size().
+    for (size_t i = 0; i < inpWrappers.size(); ++i)
+    {
+        const auto& inp = inpWrappers[i];
+        const std::string& inpName = inp->dataPtr->name;
+        int inpId;
+        it = layers.find(inpName);
+        if (it == layers.end())
+        {
+            // Unknown producer: register it as a network input.
+            InferenceEngine::Builder::InputLayer inpLayer(inpName);
+
+            std::vector<size_t> shape(inp->blob->dims());
+            std::reverse(shape.begin(), shape.end());
+
+            inpLayer.setPort(InferenceEngine::Port(shape));
+            inpId = netBuilder.addLayer(inpLayer);
+
+            layers.insert({inpName, inpId});
+        }
+        else
+            inpId = it->second;
+
+        // Explicit casts: the builder takes unsigned indices, and the braced
+        // port initializer must not rely on a narrowing int -> size_t conversion.
+        netBuilder.connect((size_t)inpId, {(size_t)layerId, i});
+        unconnectedLayersIds.erase(inpId);
+    }
+    CV_Assert(!outputs.empty());
+    InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]);
+    dataPtr->name = layerName;
+}
+
+// Finalizes the network and hands it to the plugin for the requested target.
+//
+// Steps, in order:
+//  1. Unless this object wraps an existing network (hasNetOwner), an
+//     OutputLayer is attached to every layer still listed in
+//     unconnectedLayersIds and the builder graph is converted into `cnn`.
+//  2. targetId is mapped to an Inference Engine target device; an unknown
+//     value raises Error::StsError.
+//  3. Names collected by addOutput() are registered as network outputs.
+//  4. For every network input and output, the blob with the same name is
+//     looked up in allBlobs (asserting that it exists), stored in
+//     inpBlobs / outBlobs, and its precision is applied to the network.
+//  5. initPlugin() is called on the network.
+void InfEngineBackendNet::init(int targetId)
+{
+    if (!hasNetOwner)
+    {
+        // At least one added layer must not have been consumed as an input yet.
+        CV_Assert(!unconnectedLayersIds.empty());
+        for (int id : unconnectedLayersIds)
+        {
+            // NOTE(review): every output layer gets the same hard-coded name;
+            // confirm that the builder accepts duplicate layer names.
+            InferenceEngine::Builder::OutputLayer outLayer("myconv1");
+            netBuilder.addLayer({id}, outLayer);
+        }
+        cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build()));
+    }
+
+    // Translate the OpenCV target identifier into an Inference Engine device.
+    switch (targetId)
+    {
+    case DNN_TARGET_CPU:
+        targetDevice = InferenceEngine::TargetDevice::eCPU;
+        break;
+    case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16:
+        targetDevice = InferenceEngine::TargetDevice::eGPU;
+        break;
+    case DNN_TARGET_MYRIAD:
+        targetDevice = InferenceEngine::TargetDevice::eMYRIAD;
+        break;
+    case DNN_TARGET_FPGA:
+        targetDevice = InferenceEngine::TargetDevice::eFPGA;
+        break;
+    default:
+        CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
+    }
+
+    // Outputs requested explicitly through addOutput().
+    for (const auto& name : requestedOutputs)
+    {
+        cnn.addOutput(name);
+    }
+
+    // Bind the registered blobs to the network inputs by name.
+    for (const auto& it : cnn.getInputsInfo())
+    {
+        const std::string& name = it.first;
+        auto blobIt = allBlobs.find(name);
+        CV_Assert(blobIt != allBlobs.end());
+        inpBlobs[name] = blobIt->second;
+        it.second->setPrecision(blobIt->second->precision());
+    }
+    // Bind the registered blobs to the network outputs by name.
+    for (const auto& it : cnn.getOutputsInfo())
+    {
+        const std::string& name = it.first;
+        auto blobIt = allBlobs.find(name);
+        CV_Assert(blobIt != allBlobs.end());
+        outBlobs[name] = blobIt->second;
+        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
+    }
+
+    initPlugin(cnn);
+}
+
+// Adds `layer` to the builder graph and records the returned id under the
+// layer's name. Layer names must be unique: registering a second layer with
+// an already known name fails the assertion below. The new layer starts out
+// in unconnectedLayersIds; connect() removes it once another layer uses it
+// as an input.
+void InfEngineBackendNet::addLayer(const InferenceEngine::Builder::Layer& layer)
+{
+    int id = netBuilder.addLayer(layer);
+    const std::string& layerName = layer.getName();
+    // insert() reports through .second whether the name was newly inserted.
+    CV_Assert(layers.insert({layerName, id}).second);
+    unconnectedLayersIds.insert(id);
+}
+
+// Remembers `name` as a requested network output; init() later registers all
+// remembered names with the network.
+void InfEngineBackendNet::addOutput(const std::string& name)
+{
+    requestedOutputs.emplace_back(name);
+}
+
+#endif  // IE >= R5
+
 static InferenceEngine::Layout estimateLayout(const Mat& m)
 {
    if (m.dims == 4)
@ -148,6 +276,7 @@ void InfEngineBackendWrapper::setHostDirty()

 }

+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5)
 InfEngineBackendNet::InfEngineBackendNet()
 {
    targetDevice = InferenceEngine::TargetDevice::eCPU;
@ -491,6 +620,8 @@ void InfEngineBackendNet::init(int targetId)
        initPlugin(*this);
 }

+#endif  // IE < R5
+
 static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;

 void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
@ -566,7 +697,11 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs
    auto wrappers = infEngineWrappers(ptrs);
    for (const auto& wrapper : wrappers)
    {
-        allBlobs.insert({wrapper->dataPtr->name, wrapper->blob});
+        std::string name = wrapper->dataPtr->name;
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        name = name.empty() ? "id1" : name;  // TODO: drop the magic input name.
+#endif
+        allBlobs.insert({name, wrapper->blob});
    }
 }

--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@ -35,6 +35,11 @@

 #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
 #define INF_ENGINE_VER_MAJOR_GE(ver) (((INF_ENGINE_RELEASE) / 10000) >= ((ver) / 10000))
+#define INF_ENGINE_VER_MAJOR_LT(ver) (((INF_ENGINE_RELEASE) / 10000) < ((ver) / 10000))
+
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+#include <ie_builders.hpp>
+#endif

 #endif  // HAVE_INF_ENGINE

@ -42,6 +47,7 @@ namespace cv { namespace dnn {

 #ifdef HAVE_INF_ENGINE

+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5)
 class InfEngineBackendNet : public InferenceEngine::ICNNNetwork
 {
 public:
@ -146,17 +152,75 @@ private:
    void initPlugin(InferenceEngine::ICNNNetwork& net);
 };

+#else  // IE >= R5
+
+// Inference Engine backend network for the builder API.
+// Layers are added with addLayer() and wired with connect(); init() then turns
+// the builder graph into a CNNNetwork (unless one was passed to the
+// constructor) and passes it to initPlugin().
+class InfEngineBackendNet
+{
+public:
+    // Creates an empty network to be assembled through the builder API.
+    InfEngineBackendNet();
+
+    // Wraps an already existing network instead of building one.
+    InfEngineBackendNet(InferenceEngine::CNNNetwork& net);
+
+    // Adds a layer to the builder graph; its name must not be registered yet.
+    void addLayer(const InferenceEngine::Builder::Layer& layer);
+
+    // Requests `name` as a network output; applied in init().
+    void addOutput(const std::string& name);
+
+    // Connects the layer called layerName to the producers of `inputs` and
+    // renames the data node of outputs[0] to layerName.
+    void connect(const std::vector<Ptr<BackendWrapper> >& inputs,
+                 const std::vector<Ptr<BackendWrapper> >& outputs,
+                 const std::string& layerName);
+
+    bool isInitialized();
+
+    // Finalizes the network for the given DNN_TARGET_* identifier.
+    void init(int targetId);
+
+    void forward();
+
+    void initPlugin(InferenceEngine::ICNNNetwork& net);
+
+    // Registers the blobs of the given wrappers in allBlobs, keyed by name.
+    void addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs);
+
+private:
+    // Builder graph populated by addLayer() and connect().
+    InferenceEngine::Builder::Network netBuilder;
+
+    InferenceEngine::InferenceEnginePluginPtr enginePtr;
+    InferenceEngine::InferencePlugin plugin;
+    InferenceEngine::ExecutableNetwork netExec;
+    InferenceEngine::InferRequest infRequest;
+    // Every blob registered through addBlobs(), keyed by name.
+    InferenceEngine::BlobMap allBlobs;
+    // Entries of allBlobs matching the network inputs / outputs; filled by init().
+    InferenceEngine::BlobMap inpBlobs;
+    InferenceEngine::BlobMap outBlobs;
+    // CPU by default; replaced in init() according to the target identifier.
+    InferenceEngine::TargetDevice targetDevice;
+
+    // The network: either the one passed to the constructor or the one
+    // converted from netBuilder in init().
+    InferenceEngine::CNNNetwork cnn;
+    // True when `cnn` was supplied by the caller; init() then skips the build step.
+    bool hasNetOwner;
+
+    // Layer name -> id returned by the builder.
+    std::map<std::string, int> layers;
+    // Output names collected by addOutput().
+    std::vector<std::string> requestedOutputs;
+
+    // Ids of added layers that no other layer consumes yet; init() attaches
+    // output layers to them.
+    std::set<int> unconnectedLayersIds;
+};
+#endif  // IE < R5
+
 class InfEngineBackendNode : public BackendNode
 {
 public:
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    InfEngineBackendNode(const InferenceEngine::Builder::Layer& layer);
+#else
    InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& layer);
+#endif

    void connect(std::vector<Ptr<BackendWrapper> >& inputs,
                 std::vector<Ptr<BackendWrapper> >& outputs);

-    InferenceEngine::CNNLayerPtr layer;
    // Inference Engine network object that allows to obtain the outputs of this layer.
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+    InferenceEngine::Builder::Layer layer;
    Ptr<InfEngineBackendNet> net;
+#else
+    InferenceEngine::CNNLayerPtr layer;
+    Ptr<InfEngineBackendNet> net;
+#endif
 };

 class InfEngineBackendWrapper : public BackendWrapper
--- a/modules/dnn/src/torch/THGeneral.cpp
+++ b/modules/dnn/src/torch/THGeneral.cpp
@ -1,10 +1,2 @@
 #include "../precomp.hpp"
-
-#if defined(TH_DISABLE_HEAP_TRACKING)
-#elif (defined(__unix) || defined(_WIN32))
-#include <malloc.h>
-#elif defined(__APPLE__)
-#include <malloc/malloc.h>
-#endif
-
 #include "THGeneral.h"
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@ -180,7 +180,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
        throw SkipTestException("");
    Mat sample = imread(findDataFile("dnn/street.png", false));
    Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
-    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 0.0;
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 2e-5;
    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0;
    processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",
               inp, "detection_out", "", l1, lInf, 0.25);
@ -288,7 +288,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
    Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
    // Output image has values in range [-143.526, 148.539].
    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.3 : 4e-5;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.0 : 2e-3;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.28 : 2e-3;
    processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
 }

--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@ -306,7 +306,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
    // batch size 1
    testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff);

-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_MYRIAD)
 #endif
    // batch size 2
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@ -163,7 +163,7 @@ TEST_P(Deconvolution, Accuracy)
    bool hasBias = get<6>(GetParam());
    Backend backendId = get<0>(get<7>(GetParam()));
    Target targetId = get<1>(get<7>(GetParam()));
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU &&
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE && (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_MYRIAD) &&
        dilation.width == 2 && dilation.height == 2)
        throw SkipTestException("");
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000
@ -466,6 +466,7 @@ void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId)
    pool.set("stride_w", 2);
    pool.set("stride_h", 2);
    pool.type = "Pooling";
+    pool.name = "ave_pool";

    Net net;
    int poolId = net.addLayer(pool.name, pool.type, pool);
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -295,10 +295,6 @@ TEST_P(Test_Caffe_layers, Eltwise)
 {
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
        throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
-        throw SkipTestException("Test is disabled for OpenVINO 2018R5");
-#endif
    testLayerUsingCaffeModels("layer_eltwise");
 }

--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -351,6 +351,10 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR)
        l1 = 0.009;
        lInf = 0.035;
    }
+    else if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU) {
+        l1 = 4.5e-5;
+        lInf = 1.9e-4;
+    }
    testONNXModels("LResNet100E_IR", pb, l1, lInf);
 }

@ -366,6 +370,10 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus)
        l1 = 0.021;
        lInf = 0.034;
    }
+    else if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_CPU || target == DNN_TARGET_OPENCL)) {
+        l1 = 2.4e-4;
+        lInf = 6e-4;
+    }
    testONNXModels("emotion_ferplus", pb, l1, lInf);
 }

@ -389,7 +397,7 @@ TEST_P(Test_ONNX_nets, Inception_v1)
 {
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("");
+        throw SkipTestException("Test is disabled for OpenVINO 2018R5");
 #endif
    testONNXModels("inception_v1", pb);
 }
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -40,7 +40,7 @@ TEST(Test_TensorFlow, read_inception)
    ASSERT_TRUE(!sample.empty());
    Mat input;
    resize(sample, input, Size(224, 224));
-    input -= 128; // mean sub
+    input -= Scalar::all(117); // mean sub

    Mat inputBlob = blobFromImage(input);

@ -351,8 +351,8 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
    Mat out = net.forward();

    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
-    float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1e-5;
-    float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0098 : 1e-3;
+    float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1.5e-5;
+    float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 1e-3;
    normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff);
 }

@ -366,6 +366,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN)
        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
        throw SkipTestException("");

+    double scoresDiff = backend == DNN_BACKEND_INFERENCE_ENGINE ? 2.9e-5 : 1e-5;
    for (int i = 0; i < 2; ++i)
    {
        std::string proto = findDataFile("dnn/" + names[i] + ".pbtxt", false);
@ -381,7 +382,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN)
        Mat out = net.forward();

        Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/" + names[i] + ".detection_out.npy"));
-        normAssertDetections(ref, out, names[i].c_str(), 0.3);
+        normAssertDetections(ref, out, names[i].c_str(), 0.3, scoresDiff);
    }
 }

@ -406,7 +407,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
    net.setInput(blob);
    Mat out = net.forward();

-    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : default_l1;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 1.1e-5;
    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.021 : default_lInf;
    normAssertDetections(ref, out, "", 0.4, scoreDiff, iouDiff);
 }
@ -568,10 +569,6 @@ TEST_P(Test_TensorFlow_layers, slice)
    if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
        (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
        throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("");
-#endif
    runTensorFlowNet("slice_4d");
 }

--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@ -260,6 +260,11 @@ TEST_P(Test_Torch_layers, run_paralel)

 TEST_P(Test_Torch_layers, net_residual)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL ||
+                                                    target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("Test is disabled for OpenVINO 2018R5");
+#endif
    runTorchNet("net_residual", "", false, true);
 }

@ -390,10 +395,6 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
 //   -model models/instance_norm/feathers.t7
 TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("");
-#endif
    checkBackend();
    std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7",
                            "dnn/fast_neural_style_instance_norm_feathers.t7"};
--- a/modules/features2d/src/blobdetector.cpp
+++ b/modules/features2d/src/blobdetector.cpp
@ -197,8 +197,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag
    centers.clear();

    std::vector < std::vector<Point> > contours;
-    Mat tmpBinaryImage = binaryImage.clone();
-    findContours(tmpBinaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE);
+    findContours(binaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE);

 #ifdef DEBUG_BLOB_DETECTOR
    //  Mat keypointsImage;
@ -214,7 +213,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag
    {
        Center center;
        center.confidence = 1;
-        Moments moms = moments(Mat(contours[contourIdx]));
+        Moments moms = moments(contours[contourIdx]);
        if (params.filterByArea)
        {
            double area = moms.m00;
@ -225,7 +224,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag
        if (params.filterByCircularity)
        {
            double area = moms.m00;
-            double perimeter = arcLength(Mat(contours[contourIdx]), true);
+            double perimeter = arcLength(contours[contourIdx], true);
            double ratio = 4 * CV_PI * area / (perimeter * perimeter);
            if (ratio < params.minCircularity || ratio >= params.maxCircularity)
                continue;
@ -261,9 +260,9 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag
        if (params.filterByConvexity)
        {
            std::vector < Point > hull;
-            convexHull(Mat(contours[contourIdx]), hull);
-            double area = contourArea(Mat(contours[contourIdx]));
-            double hullArea = contourArea(Mat(hull));
+            convexHull(contours[contourIdx], hull);
+            double area = contourArea(contours[contourIdx]);
+            double hullArea = contourArea(hull);
            if (fabs(hullArea) < DBL_EPSILON)
                continue;
            double ratio = area / hullArea;
--- a/modules/imgproc/perf/perf_contours.cpp
+++ b/modules/imgproc/perf/perf_contours.cpp
@ -84,4 +84,26 @@ PERF_TEST_P(TestFindContoursFF, findContours,
    SANITY_CHECK_NOTHING();
 }

+// Parameters: element depth of the point matrix, number of points.
+typedef TestBaseWithParam< tuple<MatDepth, int> > TestBoundingRect;
+
+// Measures boundingRect() on an N x 2 single-channel matrix of random points.
+PERF_TEST_P(TestBoundingRect, BoundingRect,
+    Combine(
+        testing::Values(CV_32S, CV_32F),               // points type
+        testing::Values(400, 511, 1000, 10000, 100000) // points count
+    )
+)
+{
+    const int depth = get<0>(GetParam());
+    const int count = get<1>(GetParam());
+
+    Mat points(count, 2, depth);
+    declare.in(points, WARMUP_RNG);
+
+    cv::Rect rect;
+    TEST_CYCLE()
+    {
+        rect = boundingRect(points);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 } } // namespace
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatD
 PERF_TEST_P(Size_MatType_OutMatDepth, integral,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
-                testing::Values(CV_8UC1, CV_8UC4),
+                testing::Values(CV_8UC1, CV_8UC3, CV_8UC4),
                testing::Values(CV_32S, CV_32F, CV_64F)
                )
            )
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@ -213,7 +213,7 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
-    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);
+    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);
--- a/modules/imgproc/src/fixedpoint.inl.hpp
+++ b/modules/imgproc/src/fixedpoint.inl.hpp
@ -11,16 +11,6 @@

 #include "opencv2/core/softfloat.hpp"

-#ifndef CV_ALWAYS_INLINE
-    #if defined(__GNUC__) && (__GNUC__ > 3 ||(__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
-        #define CV_ALWAYS_INLINE inline __attribute__((always_inline))
-    #elif defined(_MSC_VER)
-        #define CV_ALWAYS_INLINE __forceinline
-    #else
-        #define CV_ALWAYS_INLINE inline
-    #endif
-#endif
-
 namespace
 {

--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@ -45,6 +45,7 @@
 #include "opencl_kernels_imgproc.hpp"
 #include <iostream>
 #include "hal_replacement.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include <opencv2/core/utils/configuration.private.hpp>

 /****************************************************************************************\
@ -97,73 +98,65 @@ struct MorphNoVec
    int operator()(uchar**, int, uchar*, int) const { return 0; }
 };

-#if CV_SSE2
+#if CV_SIMD

-template<class VecUpdate> struct MorphRowIVec
+template<class VecUpdate> struct MorphRowVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar* src, uchar* dst, int width, int cn) const
    {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
-        cn *= ESZ;
        int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
+        width *= cn;
        VecUpdate updateOp;

-        for( i = 0; i <= width - 16; i += 16 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
        {
-            __m128i s = _mm_loadu_si128((const __m128i*)(src + i));
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
+            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
            for (k = cn; k < _ksize; k += cn)
            {
-                __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
            }
-            _mm_storeu_si128((__m128i*)(dst + i), s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
+            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
        }
-
-        for( ; i < width; i += 4 )
+        if( i <= width - 2*vtype::nlanes )
        {
-            __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i));
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
            for( k = cn; k < _ksize; k += cn )
            {
-                __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
            }
-            *(int*)(dst + i) = _mm_cvtsi128_si32(s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
        }
-
-        return i/ESZ;
-    }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphRowFVec
-{
-    MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar* src, uchar* dst, int width, int cn) const
+        if( i <= width - vtype::nlanes )
        {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
-        VecUpdate updateOp;
-
-        for( i = 0; i < width; i += 4 )
-        {
-            __m128 s = _mm_loadu_ps((const float*)src + i);
+            vtype s = vx_load((const stype*)src + i);
            for( k = cn; k < _ksize; k += cn )
-            {
-                __m128 x = _mm_loadu_ps((const float*)src + i + k);
-                s = updateOp(s, x);
+                s = updateOp(s, vx_load((const stype*)src + i + k));
+            v_store((stype*)dst + i, s);
+            i += vtype::nlanes;
        }
-            _mm_storeu_ps((float*)dst + i, s);
+        if( i <= width - vtype::nlanes/2 )
+        {
+            vtype s = vx_load_low((const stype*)src + i);
+            for( k = cn; k < _ksize; k += cn )
+                s = updateOp(s, vx_load_low((const stype*)src + i + k));
+            v_store_low((stype*)dst + i, s);
+            i += vtype::nlanes/2;
        }

        return i;
@ -173,419 +166,269 @@ template<class VecUpdate> struct MorphRowFVec
 };


-template<class VecUpdate> struct MorphColumnIVec
+template<class VecUpdate> struct MorphColumnVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
    {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
        int i = 0, k, _ksize = ksize;
-        width *= ESZ;
        VecUpdate updateOp;

        for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)src[i] & 15) == 0 );
+            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
+        dststep /= sizeof(dst[0]);

        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
        {
-            for( i = 0; i <= width - 32; i += 32 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
            {
-                const uchar* sptr = src[1] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                for( k = 2; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                }

                sptr = src[0] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1));
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));

                sptr = src[k] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
            }
-
-            for( ; i <= width - 8; i += 8 )
+            if( i <= width - 2*vtype::nlanes )
            {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);

                for( k = 2; k < _ksize; k++ )
-                {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
-                }
-
-                x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i));
-                _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0));
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0));
-            }
-        }
-
-        for( ; count > 0; count--, dst += dststep, src++ )
-        {
-            for( i = 0; i <= width - 32; i += 32 )
-            {
-                const uchar* sptr = src[0] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
-
-                for( k = 1; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                }
-                _mm_storeu_si128((__m128i*)(dst + i), s0);
-                _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                }

-            for( ; i <= width - 8; i += 8 )
-            {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
-
-                for( k = 1; k < _ksize; k++ )
-                {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
-                }
-                _mm_storel_epi64((__m128i*)(dst + i), s0);
-            }
-        }
+                sptr = src[0] + i;
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));

-        return i/ESZ;
+                sptr = src[k] + i;
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                i += 2*vtype::nlanes;
            }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphColumnFVec
+            if( i <= width - vtype::nlanes )
            {
-    MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i = 0, k, _ksize = ksize;
-        VecUpdate updateOp;
-
-        for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & 15) == 0 );
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
-        dststep /= sizeof(dst[0]);
-
-        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
-        {
-            for( i = 0; i <= width - 16; i += 16 )
-            {
-                const float* sptr = src[1] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
+                vtype s0 = vx_load_aligned(src[1] + i);

                for( k = 2; k < _ksize; k++ )
-                {
-                    sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
-                }
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));

-                sptr = src[0] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + i + 12, updateOp(s3, x3));
-
-                sptr = src[k] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3));
+                v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
+                i += vtype::nlanes;
            }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - vtype::nlanes/2 )
            {
-                __m128 s0 = _mm_load_ps(src[1] + i), x0;
+                vtype s0 = vx_load_low(src[1] + i);

                for( k = 2; k < _ksize; k++ )
-                {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
-                }
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));

-                x0 = _mm_load_ps(src[0] + i);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                x0 = _mm_load_ps(src[k] + i);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
+                v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
+                v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
+                i += vtype::nlanes/2;
            }
        }

        for( ; count > 0; count--, dst += dststep, src++ )
        {
-            for( i = 0; i <= width - 16; i += 16 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
            {
-                const float* sptr = src[0] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                for( k = 1; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                }
-                _mm_storeu_ps(dst + i, s0);
-                _mm_storeu_ps(dst + i + 4, s1);
-                _mm_storeu_ps(dst + i + 8, s2);
-                _mm_storeu_ps(dst + i + 12, s3);
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                v_store(dst + i + 2*vtype::nlanes, s2);
+                v_store(dst + i + 3*vtype::nlanes, s3);
            }
-
-            for( i = 0; i <= width - 4; i += 4 )
+            if( i <= width - 2*vtype::nlanes )
            {
-                __m128 s0 = _mm_load_ps(src[0] + i), x0;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+
                for( k = 1; k < _ksize; k++ )
                {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
-                }
-                _mm_storeu_ps(dst + i, s0);
-            }
+                    sptr = src[k] + i;
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                }
-
-        return i;
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                i += 2*vtype::nlanes;
            }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphIVec
-{
-    enum { ESZ = VecUpdate::ESZ };
-
-    int operator()(uchar** src, int nz, uchar* dst, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
-        int i, k;
-        width *= ESZ;
-        VecUpdate updateOp;
-
-        for( i = 0; i <= width - 32; i += 32 )
+            if( i <= width - vtype::nlanes )
            {
-            const uchar* sptr = src[0] + i;
-            __m128i s0 = _mm_loadu_si128((const __m128i*)sptr);
-            __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-            __m128i x0, x1;
+                vtype s0 = vx_load_aligned(src[0] + i);

-            for( k = 1; k < nz; k++ )
-            {
-                sptr = src[k] + i;
-                x0 = _mm_loadu_si128((const __m128i*)sptr);
-                x1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
-            }
-            _mm_storeu_si128((__m128i*)(dst + i), s0);
-            _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
+                for( k = 1; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));
+                v_store(dst + i, s0);
+                i += vtype::nlanes;
            }
-
-        for( ; i <= width - 8; i += 8 )
+            if( i <= width - vtype::nlanes/2 )
            {
-            __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
+                vtype s0 = vx_load_low(src[0] + i);

-            for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                s0 = updateOp(s0, x0);
+                for( k = 1; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));
+                v_store_low(dst + i, s0);
+                i += vtype::nlanes/2;
            }
-            _mm_storel_epi64((__m128i*)(dst + i), s0);
        }

-        return i/ESZ;
+        return i;
    }
+
+    int ksize, anchor;
 };


-template<class VecUpdate> struct MorphFVec
+template<class VecUpdate> struct MorphVec
 {
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
        int i, k;
        VecUpdate updateOp;

-        for( i = 0; i <= width - 16; i += 16 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
        {
-            const float* sptr = src[0] + i;
-            __m128 s0 = _mm_loadu_ps(sptr);
-            __m128 s1 = _mm_loadu_ps(sptr + 4);
-            __m128 s2 = _mm_loadu_ps(sptr + 8);
-            __m128 s3 = _mm_loadu_ps(sptr + 12);
-            __m128 x0, x1, x2, x3;
-
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
+            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
            for( k = 1; k < nz; k++ )
            {
                sptr = src[k] + i;
-                x0 = _mm_loadu_ps(sptr);
-                x1 = _mm_loadu_ps(sptr + 4);
-                x2 = _mm_loadu_ps(sptr + 8);
-                x3 = _mm_loadu_ps(sptr + 12);
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
-                s2 = updateOp(s2, x2);
-                s3 = updateOp(s3, x3);
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
            }
-            _mm_storeu_ps(dst + i, s0);
-            _mm_storeu_ps(dst + i + 4, s1);
-            _mm_storeu_ps(dst + i + 8, s2);
-            _mm_storeu_ps(dst + i + 12, s3);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            v_store(dst + i + 2*vtype::nlanes, s2);
+            v_store(dst + i + 3*vtype::nlanes, s3);
        }
-
-        for( ; i <= width - 4; i += 4 )
+        if( i <= width - 2*vtype::nlanes )
        {
-            __m128 s0 = _mm_loadu_ps(src[0] + i), x0;
-
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
            for( k = 1; k < nz; k++ )
            {
-                x0 = _mm_loadu_ps(src[k] + i);
-                s0 = updateOp(s0, x0);
+                sptr = src[k] + i;
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
            }
-            _mm_storeu_ps(dst + i, s0);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
        }
-
-        for( ; i < width; i++ )
+        if( i <= width - vtype::nlanes )
        {
-            __m128 s0 = _mm_load_ss(src[0] + i), x0;
-
+            vtype s0 = vx_load(src[0] + i);
            for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_load_ss(src[k] + i);
-                s0 = updateOp(s0, x0);
+                s0 = updateOp(s0, vx_load(src[k] + i));
+            v_store(dst + i, s0);
+            i += vtype::nlanes;
        }
-            _mm_store_ss(dst + i, s0);
+        if( i <= width - vtype::nlanes/2 )
+        {
+            vtype s0 = vx_load_low(src[0] + i);
+            for( k = 1; k < nz; k++ )
+                s0 = updateOp(s0, vx_load_low(src[k] + i));
+            v_store_low(dst + i, s0);
+            i += vtype::nlanes/2;
        }
-
        return i;
    }
 };

-struct VMin8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }
-};
-struct VMax8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }
-};
-struct VMin16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
-};
-struct VMax16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_adds_epu16(_mm_subs_epu16(a,b), b); }
-};
-struct VMin16s
+template <typename T> struct VMin
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_min_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); }
 };
-struct VMax16s
+template <typename T> struct VMax
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_max_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); }
 };
-struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
-struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};
-
-typedef MorphRowIVec<VMin8u> ErodeRowVec8u;
-typedef MorphRowIVec<VMax8u> DilateRowVec8u;
-typedef MorphRowIVec<VMin16u> ErodeRowVec16u;
-typedef MorphRowIVec<VMax16u> DilateRowVec16u;
-typedef MorphRowIVec<VMin16s> ErodeRowVec16s;
-typedef MorphRowIVec<VMax16s> DilateRowVec16s;
-typedef MorphRowFVec<VMin32f> ErodeRowVec32f;
-typedef MorphRowFVec<VMax32f> DilateRowVec32f;
-
-typedef MorphColumnIVec<VMin8u> ErodeColumnVec8u;
-typedef MorphColumnIVec<VMax8u> DilateColumnVec8u;
-typedef MorphColumnIVec<VMin16u> ErodeColumnVec16u;
-typedef MorphColumnIVec<VMax16u> DilateColumnVec16u;
-typedef MorphColumnIVec<VMin16s> ErodeColumnVec16s;
-typedef MorphColumnIVec<VMax16s> DilateColumnVec16s;
-typedef MorphColumnFVec<VMin32f> ErodeColumnVec32f;
-typedef MorphColumnFVec<VMax32f> DilateColumnVec32f;
-
-typedef MorphIVec<VMin8u> ErodeVec8u;
-typedef MorphIVec<VMax8u> DilateVec8u;
-typedef MorphIVec<VMin16u> ErodeVec16u;
-typedef MorphIVec<VMax16u> DilateVec16u;
-typedef MorphIVec<VMin16s> ErodeVec16s;
-typedef MorphIVec<VMax16s> DilateVec16s;
-typedef MorphFVec<VMin32f> ErodeVec32f;
-typedef MorphFVec<VMax32f> DilateVec32f;
+
+typedef MorphRowVec<VMin<v_uint8> > ErodeRowVec8u;
+typedef MorphRowVec<VMax<v_uint8> > DilateRowVec8u;
+typedef MorphRowVec<VMin<v_uint16> > ErodeRowVec16u;
+typedef MorphRowVec<VMax<v_uint16> > DilateRowVec16u;
+typedef MorphRowVec<VMin<v_int16> > ErodeRowVec16s;
+typedef MorphRowVec<VMax<v_int16> > DilateRowVec16s;
+typedef MorphRowVec<VMin<v_float32> > ErodeRowVec32f;
+typedef MorphRowVec<VMax<v_float32> > DilateRowVec32f;
+
+typedef MorphColumnVec<VMin<v_uint8> > ErodeColumnVec8u;
+typedef MorphColumnVec<VMax<v_uint8> > DilateColumnVec8u;
+typedef MorphColumnVec<VMin<v_uint16> > ErodeColumnVec16u;
+typedef MorphColumnVec<VMax<v_uint16> > DilateColumnVec16u;
+typedef MorphColumnVec<VMin<v_int16> > ErodeColumnVec16s;
+typedef MorphColumnVec<VMax<v_int16> > DilateColumnVec16s;
+typedef MorphColumnVec<VMin<v_float32> > ErodeColumnVec32f;
+typedef MorphColumnVec<VMax<v_float32> > DilateColumnVec32f;
+
+typedef MorphVec<VMin<v_uint8> > ErodeVec8u;
+typedef MorphVec<VMax<v_uint8> > DilateVec8u;
+typedef MorphVec<VMin<v_uint16> > ErodeVec16u;
+typedef MorphVec<VMax<v_uint16> > DilateVec16u;
+typedef MorphVec<VMin<v_int16> > ErodeVec16s;
+typedef MorphVec<VMax<v_int16> > DilateVec16s;
+typedef MorphVec<VMin<v_float32> > ErodeVec32f;
+typedef MorphVec<VMax<v_float32> > DilateVec32f;

 #else

--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@ -39,6 +39,8 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
 namespace cv
 {

@ -746,53 +748,105 @@ static Rect pointSetBoundingRect( const Mat& points )
    if( npoints == 0 )
        return Rect();

-    const Point* pts = points.ptr<Point>();
-    Point pt = pts[0];
+#if CV_SIMD
+    const int64_t* pts = points.ptr<int64_t>();

-#if CV_SSE4_2
-    if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
-    {
    if( !is_float )
    {
-            __m128i minval, maxval;
-            minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y
-
-            for( i = 1; i < npoints; i++ )
+        v_int32 minval, maxval;
+        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
+        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
        {
-                __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]);
-                minval = _mm_min_epi32(ptXY, minval);
-                maxval = _mm_max_epi32(ptXY, maxval);
+            v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
+            minval = v_min(ptXY2, minval);
+            maxval = v_max(ptXY2, maxval);
        }
-            xmin = _mm_cvtsi128_si32(minval);
-            ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
-            xmax = _mm_cvtsi128_si32(maxval);
-            ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
+        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        if( i <= npoints - v_int32::nlanes/4 )
+        {
+            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
+            minval = v_min(ptXY, minval);
+            maxval = v_max(ptXY, maxval);
+            i += v_int64::nlanes/2;
        }
-        else
+        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
        {
-            __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
-            minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));
-
-            for( i = 1; i < npoints; i++ )
+            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        }
+        xmin = minval.get0();
+        xmax = maxval.get0();
+        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
+        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+#if CV_SIMD_WIDTH > 16
+        if( i < npoints )
        {
-                ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]);
-
-                minvalf = _mm_min_ps(minvalf, ptXY);
-                maxvalf = _mm_max_ps(maxvalf, ptXY);
+            v_int32x4 minval2, maxval2;
+            minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
+            for( i++; i < npoints; i++ )
+            {
+                v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
+                minval2 = v_min(ptXY, minval2);
+                maxval2 = v_max(ptXY, maxval2);
            }
-
-            float xyminf[2], xymaxf[2];
-            _mm_storel_pi((__m64*)xyminf, minvalf);
-            _mm_storel_pi((__m64*)xymaxf, maxvalf);
-            xmin = cvFloor(xyminf[0]);
-            ymin = cvFloor(xyminf[1]);
-            xmax = cvFloor(xymaxf[0]);
-            ymax = cvFloor(xymaxf[1]);
+            xmin = min(xmin, minval2.get0());
+            xmax = max(xmax, maxval2.get0());
+            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
+            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
        }
+#endif
    }
    else
-#endif
    {
+        v_float32 minval, maxval;
+        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
+        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+        {
+            v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
+            minval = v_min(ptXY2, minval);
+            maxval = v_max(ptXY2, maxval);
+        }
+        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
+        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        if( i <= npoints - v_float32::nlanes/4 )
+        {
+            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
+            minval = v_min(ptXY, minval);
+            maxval = v_max(ptXY, maxval);
+            i += v_float32::nlanes/4;
+        }
+        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        {
+            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
+            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        }
+        xmin = cvFloor(minval.get0());
+        xmax = cvFloor(maxval.get0());
+        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
+        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+#if CV_SIMD_WIDTH > 16
+        if( i < npoints )
+        {
+            v_float32x4 minval2, maxval2;
+            minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
+            for( i++; i < npoints; i++ )
+            {
+                v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
+                minval2 = v_min(ptXY, minval2);
+                maxval2 = v_max(ptXY, maxval2);
+            }
+            xmin = min(xmin, cvFloor(minval2.get0()));
+            xmax = max(xmax, cvFloor(maxval2.get0()));
+            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
+            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+        }
+#endif
+    }
+#else
+    const Point* pts = points.ptr<Point>();
+    Point pt = pts[0];
+
    if( !is_float )
    {
        xmin = xmax = pt.x;
@ -848,7 +902,7 @@ static Rect pointSetBoundingRect( const Mat& points )
        v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
        v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
    }
-    }
+#endif

    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
 }
--- a/modules/imgproc/src/sumpixels.avx512_skx.cpp
+++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp
@ -0,0 +1,262 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#include "precomp.hpp"
+#include "sumpixels.hpp"
+
+namespace cv {
+namespace { // Anonymous namespace to avoid exposing the implementation classes
+
+//
+// NOTE: Look at the bottom of the file for the entry-point function for external callers
+//
+
+// At the moment only the untilted 3-channel case is supported.
+// Support for more channel counts is coming soon.
+// TODO: Add support for sqsum and 1, 2, and 4 channels
+class IntegralCalculator_3Channel {
+public:
+    IntegralCalculator_3Channel() {}
+
+    // Computes the untilted integral image of an 8-bit, 3-channel interleaved source and,
+    // when sqsum is non-NULL, the integral of the squared values as well.
+    // All steps are in bytes. Each output receives (height+1) rows of (width+1)*cn doubles;
+    // the first row and the first pixel of every row are zero. cn must be 3.
+    void calculate_integral_avx512(const uchar *src, size_t _srcstep,
+                                   double *sum,      size_t _sumstep,
+                                   double *sqsum,    size_t _sqsumstep,
+                                   int width, int height, int cn)
+    {
+        // Strides stay size_t so that y*step cannot overflow int on very large images
+        const size_t srcstep   = _srcstep/sizeof(uchar);
+        const size_t sumstep   = _sumstep/sizeof(double);
+        const size_t sqsumstep = _sqsumstep/sizeof(double);
+        const int ops_per_line = width * cn;
+
+        // Clear the first line of the sum as per spec (see integral documentation)
+        // Also adjust the index of sum and sqsum to be at the real 0th element
+        // and not point to the border pixel so it stays in sync with the src pointer
+        memset( sum, 0, (ops_per_line+cn)*sizeof(double));
+        sum += cn;
+
+        if (sqsum) {
+            memset( sqsum, 0, (ops_per_line+cn)*sizeof(double));
+            sqsum += cn;
+        }
+
+        // Now calculate the integral over the whole image one line at a time
+        for(int y = 0; y < height; y++) {
+            const uchar * src_line    = &src[(size_t)y*srcstep];
+            double      * sum_above   = &sum[(size_t)y*sumstep];
+            double      * sum_line    = &sum_above[sumstep];
+            double      * sqsum_above = (sqsum) ? &sqsum[(size_t)y*sqsumstep] : NULL;
+            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep]     : NULL;
+            integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
+        }
+    }
+
+    // Computes one output row: zeroes the row's border pixel, then processes the row in
+    // 64-value chunks followed by one masked chunk for whatever remains.
+    static inline
+    void integral_line_3channel_avx512(const uchar *srcs,
+                                       double *sums,   double *sums_above,
+                                       double *sqsums, double *sqsums_above,
+                                       int num_ops_in_line)
+    {
+        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
+        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line
+
+        // The first element on each line must be zeroes as per spec (see integral documentation)
+        set_border_pixel_value(sums, sqsums);
+
+        // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk
+        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+        post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+    }
+
+    // Zeroes the 3 doubles immediately before sums (and before sqsums, when non-NULL).
+    static inline
+    void set_border_pixel_value(double *sums, double *sqsums)
+    {
+        // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now
+        __m512i zeroes = _mm512_setzero_si512();
+
+        _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes);
+        if (sqsums)
+            _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes);
+    }
+
+    // Processes every full 64-value chunk of the line. The pointers are taken by reference
+    // and are left just past the last processed value, ready for post_aligned_integral().
+    static inline
+    void aligned_integral(const uchar *&srcs,
+                          double *&sums,  double *&sums_above,
+                          double *&sqsum, double *&sqsum_above,
+                          __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                          int num_ops_in_line)
+    {
+        const int num_chunks = num_ops_in_line >> 6;  // quick int divide by 64
+
+        for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
+            integral_64_operations_avx512((__m512i *) srcs,
+                                          (__m512i *) sums,  (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
+            srcs+=64; sums+=64; sums_above+=64;
+            if (sqsum){ sqsum+= 64; sqsum_above+=64; }
+        }
+    }
+
+    // Processes the final (num_ops_in_line % 64) values of the line with the same algorithm,
+    // using a lane mask so that lanes past the end of the line are neither loaded nor stored.
+    static inline
+    void post_aligned_integral(const uchar *srcs,
+                               double *sums,   const double *sums_above,
+                               double *sqsum,  const double *sqsum_above,
+                               __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                               int num_ops_in_line)
+    {
+        const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F;  // Quick int modulo 64
+
+        if (num_operations > 0) {
+            __mmask64 operation_mask = (1ULL << num_operations) - 1ULL;
+
+            integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          operation_mask, sum_accumulator, sqsum_accumulator);
+        }
+    }
+
+    // Integrates up to 64 consecutive source bytes, 8 at a time: each group of 8 bytes is
+    // widened to 64-bit lanes and handed to integral_8_operations(). data_mask holds one bit
+    // per source byte; it is shifted right by 8 after each group so that its low 8 bits
+    // always describe the group currently being processed.
+    static inline
+    void integral_64_operations_avx512(const __m512i *srcs,
+                                       __m512i *sums,       const __m512i *sums_above,
+                                       __m512i *sqsums,     const __m512i *sqsums_above,
+                                       __mmask64 data_mask,
+                                       __m512i &sum_accumulator, __m512i &sqsum_accumulator)
+    {
+        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);
+
+        for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) {
+            __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); // Get lower 16 bytes of data
+
+            for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) {
+                __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes);
+
+                // Calculate integral for the sum on the 8 entries
+                integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator);
+                sums++; sums_above++;
+
+                if (sqsums){ // Calculate integral for the sqsum on the 8 entries
+                    __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs);
+                    integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator);
+                    sqsums++; sqsums_above++;
+                }
+
+                // Prepare for next iteration of inner loop
+                // shift source to align next 8 bytes to lane 0 and shift the mask
+                src_16bytes = shift_right_8_bytes(src_16bytes);
+                data_mask = data_mask >> 8;
+            }
+
+            // Prepare for next iteration of outer loop
+            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
+        }
+    }
+
+    // Computes and stores the integral for one group of 8 values. Only the low 8 bits of
+    // data_mask are used: they select the lanes loaded from the row above and stored.
+    static inline
+    void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
+                               __m512i *results_ptr, __m512i &accumulator)
+    {
+        const __mmask8 lane_mask = (__mmask8) data_mask;  // the pd intrinsics take an 8-bit mask
+        _mm512_mask_storeu_pd(
+                results_ptr,   // Store the result here
+                lane_mask,     // Using the data mask to avoid overrunning the line
+                calculate_integral( // Writing the value of the integral derived from:
+                        src_longs,                                           // input data
+                        _mm512_maskz_loadu_pd(lane_mask, above_values_ptr),  // and the results from line above
+                        accumulator                                          // keeping track of the accumulator
+                )
+        );
+    }
+
+    // Returns the integral values for 8 inputs. accumulator carries the running row sums
+    // (updated here); they are converted to double and added to above_values.
+    static inline
+    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+    {
+        __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
+
+        // Align data to prepare for the adds:
+        //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
+        //   Vertical LANES:     76543210
+        //   src_longs       :   HGFEDCBA
+        //   shifted3lanes   : + EDCBA
+        //   shifted6lanes   : + BA
+        //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
+        //                     = integral
+        __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
+        __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
+        __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+        // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator)
+        __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes);
+        __m512i sum_src_carry  = _mm512_add_epi64(src_longs, carry_over);
+        accumulator            = _mm512_add_epi64(sum_shift3and6, sum_src_carry);
+
+        // Convert to packed double and add to the line above to get the true integral value
+        __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
+        __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+        return integral_pd;
+    }
+
+    // Loads up to 64 source bytes; bytes whose data_mask bit is clear are read as zero.
+    static inline
+    __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask)  {
+        return _mm512_maskz_loadu_epi8(data_mask, srcs);
+    }
+
+    // Zero-extends the low 8 bytes of src_16bytes into eight 64-bit lanes.
+    static inline
+    __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes)  {
+        return _mm512_cvtepu8_epi64(src_16bytes);
+    }
+
+    // Moves the upper 8 bytes of src_16bytes into the lower 8 bytes and zeroes the rest.
+    static inline
+    __m128i shift_right_8_bytes(__m128i src_16bytes)  {
+        return _mm_maskz_compress_epi64(2, src_16bytes);
+    }
+
+    // Shifts the chunk down by 16 bytes (two 64-bit lanes), zero-filling the top lanes.
+    static inline
+    __m512i shift_right_16_bytes(__m512i src_64byte_chunk)  {
+        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
+    }
+};
+} // end of anonymous namespace
+
+namespace opt_AVX512_SKX {
+
+// External entry point for the AVX-512 (SKX) integral implementation.
+// Callers outside this file should go through this function only;
+// dispatch to any future implementations belongs here as well.
+void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
+                               double      *sum,   size_t _sumstep,
+                               double      *sqsum, size_t _sqsumstep,
+                               int width, int height, int cn)
+{
+    IntegralCalculator_3Channel().calculate_integral_avx512(
+            src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
+}
+
+
+} // end namespace opt_AVX512_SKX
+} // end namespace cv
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@ -10,7 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
@ -44,6 +44,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "sumpixels.hpp"


 namespace cv
@ -62,6 +63,37 @@ struct Integral_SIMD
    }
 };

+
+template <>
+struct Integral_SIMD<uchar, double, double> {
+    Integral_SIMD() {}
+
+    // Tries the AVX512_SKX integral kernel for 8-bit input with double outputs.
+    // Returns true when the kernel handled the request, false otherwise.
+    bool operator()(const uchar *src, size_t _srcstep,
+                    double *sum,      size_t _sumstep,
+                    double *sqsum,    size_t _sqsumstep,
+                    double *tilted,   size_t _tiltedstep,
+                    int width, int height, int cn) const
+    {
+#if CV_TRY_AVX512_SKX
+        CV_UNUSED(_tiltedstep);
+        // TODO: Add support for 1,2, and 4 channels (only untilted 3-channel input is handled)
+        const bool use_avx512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3;
+        if (use_avx512)
+            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
+                                                      sqsum, _sqsumstep, width, height, cn);
+        return use_avx512;
+#else
+        // Nothing to dispatch to in this build; mark every argument as used to avoid warnings
+        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
+        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
+        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
+        return false;
+#endif
+    }
+};
+
 #if CV_SIMD && CV_SIMD_WIDTH <= 64

 template <>
--- a/modules/imgproc/src/sumpixels.hpp
+++ b/modules/imgproc/src/sumpixels.hpp
@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
+#define OPENCV_IMGPROC_SUM_PIXELS_HPP
+
+namespace cv
+{
+
+namespace opt_AVX512_SKX
+{
+#if CV_TRY_AVX512_SKX
+    // Integral image entry point (see sumpixels.avx512_skx.cpp); declared only when CV_TRY_AVX512_SKX is set.
+    void calculate_integral_avx512(
+            const uchar *src, size_t _srcstep,
+            double *sum,      size_t _sumstep,
+            double *sqsum,    size_t _sqsumstep,
+            int width, int height, int cn);
+#endif
+} // end namespace opt_AVX512_SKX
+} // end namespace cv
+
+#endif
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@ -190,22 +190,19 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
    int j = 0;
    const uchar* src = _src.ptr();
    uchar* dst = _dst.ptr();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        v_uint8x16 thresh_u = v_setall_u8( thresh );
-        v_uint8x16 maxval16 = v_setall_u8( maxval );
+#if CV_SIMD
+    v_uint8 thresh_u = vx_setall_u8( thresh );
+    v_uint8 maxval16 = vx_setall_u8( maxval );

    switch( type )
    {
    case THRESH_BINARY:
        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-                for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
            {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                v0 = thresh_u < v0;
                v0 = v0 & maxval16;
                v_store( dst + j, v0 );
@ -216,10 +213,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
    case THRESH_BINARY_INV:
        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-                for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
            {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                v0 = v0 <= thresh_u;
                v0 = v0 & maxval16;
                v_store( dst + j, v0 );
@ -230,10 +227,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
    case THRESH_TRUNC:
        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-                for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
            {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                v0 = v0 - ( v0 - thresh_u );
                v_store( dst + j, v0 );
            }
@ -243,10 +240,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
    case THRESH_TOZERO:
        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-                for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
            {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                v0 = ( thresh_u < v0 ) & v0;
                v_store( dst + j, v0 );
            }
@ -256,17 +253,16 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
    case THRESH_TOZERO_INV:
        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-                for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
            {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                v0 = ( v0 <= thresh_u ) & v0;
                v_store( dst + j, v0 );
            }
        }
        break;
    }
-    }
 #endif

    int j_scalar = j;
@ -355,30 +351,35 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)

    const ushort* src = _src.ptr<ushort>();
    ushort* dst = _dst.ptr<ushort>();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
-    if (useSIMD)
-    {
+#if CV_SIMD
    int i, j;
-        v_uint16x8 thresh_u = v_setall_u16(thresh);
-        v_uint16x8 maxval16 = v_setall_u16(maxval);
+    v_uint16 thresh_u = vx_setall_u16(thresh);
+    v_uint16 maxval16 = vx_setall_u16(maxval);

    switch (type)
    {
    case THRESH_BINARY:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
-                for (j = 0; j <= roi.width - 16; j += 16)
+            for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
            {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                v0 = thresh_u < v0;
                v1 = thresh_u < v1;
                v0 = v0 & maxval16;
                v1 = v1 & maxval16;
                v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }

            for (; j < roi.width; j++)
@ -390,17 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
-                for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
            {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                v0 = v0 <= thresh_u;
                v1 = v1 <= thresh_u;
                v0 = v0 & maxval16;
                v1 = v1 & maxval16;
                v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }

            for (; j < roi.width; j++)
@ -412,15 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
-                for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
            {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                v0 = v_min(v0, thresh_u);
                v1 = v_min(v1, thresh_u);
                v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v_min(v0, thresh_u);
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }

            for (; j < roi.width; j++)
@ -432,15 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
-                for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
            {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                v0 = (thresh_u < v0) & v0;
                v1 = (thresh_u < v1) & v1;
                v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (thresh_u < v0) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }

            for (; j < roi.width; j++)
@ -452,15 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
-                for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
            {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                v0 = (v0 <= thresh_u) & v0;
                v1 = (v1 <= thresh_u) & v1;
                v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (v0 <= thresh_u) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }

            for (; j < roi.width; j++)
@ -468,12 +498,9 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
        }
        break;
    }
-    }
-    else
-#endif
-    {
+#else
    threshGeneric<ushort>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }

 static void
@ -544,13 +571,10 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
    }
 #endif

-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD
    int i, j;
-        v_int16x8 thresh8 = v_setall_s16( thresh );
-        v_int16x8 maxval8 = v_setall_s16( maxval );
+    v_int16 thresh8 = vx_setall_s16( thresh );
+    v_int16 maxval8 = vx_setall_s16( maxval );

    switch( type )
    {
@ -558,17 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
            {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                v0 = thresh8 < v0;
                v1 = thresh8 < v1;
                v0 = v0 & maxval8;
                v1 = v1 & maxval8;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = thresh8 < v0;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -580,17 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
            {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                v0 = v0 <= thresh8;
                v1 = v1 <= thresh8;
                v0 = v0 & maxval8;
                v1 = v1 & maxval8;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v0 <= thresh8;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -602,15 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
            {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                v0 = v_min( v0, thresh8 );
                v1 = v_min( v1, thresh8 );
                v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh8 );
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -622,15 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
            {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                v0 = ( thresh8 < v0 ) & v0;
                v1 = ( thresh8 < v1 ) & v1;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( thresh8 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -642,15 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
            {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                v0 = ( v0 <= thresh8 ) & v0;
                v1 = ( v1 <= thresh8 ) & v1;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -660,12 +721,9 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
    default:
        CV_Error( CV_StsBadArg, "" ); return;
    }
-    }
-    else
-#endif
-    {
+#else
    threshGeneric<short>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }


@ -719,13 +777,10 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
    }
 #endif

-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD
    int i, j;
-        v_float32x4 thresh4 = v_setall_f32( thresh );
-        v_float32x4 maxval4 = v_setall_f32( maxval );
+    v_float32 thresh4 = vx_setall_f32( thresh );
+    v_float32 maxval4 = vx_setall_f32( maxval );

    switch( type )
    {
@ -733,17 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                    for( ; j <= roi.width - 8; j += 8 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                        v_float32x4 v0, v1;
-                        v0 = v_load( src + j );
-                        v1 = v_load( src + j + 4 );
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
                    v0 = thresh4 < v0;
                    v1 = thresh4 < v1;
                    v0 = v0 & maxval4;
                    v1 = v1 & maxval4;
                    v_store( dst + j, v0 );
-                        v_store( dst + j + 4, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = thresh4 < v0;
+                    v0 = v0 & maxval4;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }

                for( ; j < roi.width; j++ )
@ -755,17 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                    for( ; j <= roi.width - 8; j += 8 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                        v_float32x4 v0, v1;
-                        v0 = v_load( src + j );
-                        v1 = v_load( src + j + 4 );
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
                    v0 = v0 <= thresh4;
                    v1 = v1 <= thresh4;
                    v0 = v0 & maxval4;
                    v1 = v1 & maxval4;
                    v_store( dst + j, v0 );
-                        v_store( dst + j + 4, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = v0 <= thresh4;
+                    v0 = v0 & maxval4;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }

                for( ; j < roi.width; j++ )
@ -777,15 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                    for( ; j <= roi.width - 8; j += 8 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                        v_float32x4 v0, v1;
-                        v0 = v_load( src + j );
-                        v1 = v_load( src + j + 4 );
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
                    v0 = v_min( v0, thresh4 );
                    v1 = v_min( v1, thresh4 );
                    v_store( dst + j, v0 );
-                        v_store( dst + j + 4, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = v_min( v0, thresh4 );
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }

                for( ; j < roi.width; j++ )
@ -797,15 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                    for( ; j <= roi.width - 8; j += 8 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                        v_float32x4 v0, v1;
-                        v0 = v_load( src + j );
-                        v1 = v_load( src + j + 4 );
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
                    v0 = ( thresh4 < v0 ) & v0;
                    v1 = ( thresh4 < v1 ) & v1;
                    v_store( dst + j, v0 );
-                        v_store( dst + j + 4, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = ( thresh4 < v0 ) & v0;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }

                for( ; j < roi.width; j++ )
@ -817,15 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                    for( ; j <= roi.width - 8; j += 8 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                        v_float32x4 v0, v1;
-                        v0 = v_load( src + j );
-                        v1 = v_load( src + j + 4 );
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
                    v0 = ( v0 <= thresh4 ) & v0;
                    v1 = ( v1 <= thresh4 ) & v1;
                    v_store( dst + j, v0 );
-                        v_store( dst + j + 4, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = ( v0 <= thresh4 ) & v0;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }

                for( ; j < roi.width; j++ )
@ -835,12 +927,9 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
        default:
            CV_Error( CV_StsBadArg, "" ); return;
    }
-    }
-    else
-#endif
-    {
+#else
    threshGeneric<float>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }

 static void
@ -859,13 +948,10 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        roi.height = 1;
    }

-#if CV_SIMD128_64F
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD_64F
    int i, j;
-        v_float64x2 thresh2 = v_setall_f64( thresh );
-        v_float64x2 maxval2 = v_setall_f64( maxval );
+    v_float64 thresh2 = vx_setall_f64( thresh );
+    v_float64 maxval2 = vx_setall_f64( maxval );

    switch( type )
    {
@ -873,17 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
            {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                v0 = thresh2 < v0;
                v1 = thresh2 < v1;
                v0 = v0 & maxval2;
                v1 = v1 & maxval2;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = thresh2 < v0;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -895,17 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
            {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                v0 = v0 <= thresh2;
                v1 = v1 <= thresh2;
                v0 = v0 & maxval2;
                v1 = v1 & maxval2;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v0 <= thresh2;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -917,15 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
            {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                v0 = v_min( v0, thresh2 );
                v1 = v_min( v1, thresh2 );
                v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh2 );
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -937,15 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
            {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                v0 = ( thresh2 < v0 ) & v0;
                v1 = ( thresh2 < v1 ) & v1;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( thresh2 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -957,15 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
            {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                v0 = ( v0 <= thresh2 ) & v0;
                v1 = ( v1 <= thresh2 ) & v1;
                v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh2 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }

            for( ; j < roi.width; j++ )
@ -975,12 +1098,9 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
    default:
        CV_Error(CV_StsBadArg, ""); return;
    }
-    }
-    else
-#endif
-    {
+#else
    threshGeneric<double>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }

 #ifdef HAVE_IPP
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@ -3062,4 +3062,14 @@ TEST(ImgProc_BGR2RGBA, 3ch24ch)
    EXPECT_DOUBLE_EQ(cvtest::norm(expected - dst, NORM_INF), 0.);
 }

+// Regression test named after 13668: fills a 32x4 CV_8UC3 image with a single
+// constant colour, runs cvtColor with COLOR_RGB2YUV, and compares one output
+// pixel against a fixed expected triple.
+TEST(ImgProc_RGB2YUV, regression_13668)
+{
+    Mat src(Size(32, 4), CV_8UC3, Scalar(9, 250,  82));  // Ensure that SIMD code path works
+    Mat dst;
+    cvtColor(src, dst, COLOR_RGB2YUV);
+    // Every input pixel holds the same value; only the pixel at (0, 0) is checked.
+    Vec3b res = dst.at<Vec3b>(0, 0);
+    Vec3b ref(159, 90, 0);  // expected output for the input colour above
+    EXPECT_EQ(res, ref);
+}
+
 }} // namespace
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@ -387,7 +387,7 @@ bool QRDetect::computeTransformationPoints()
        findNonZero(mask_roi, non_zero_elem[i]);
        newHull.insert(newHull.end(), non_zero_elem[i].begin(), non_zero_elem[i].end());
    }
-    convexHull(Mat(newHull), locations);
+    convexHull(newHull, locations);
    for (size_t i = 0; i < locations.size(); i++)
    {
        for (size_t j = 0; j < 3; j++)
@ -556,7 +556,7 @@ vector<Point2f> QRDetect::getQuadrilateral(vector<Point2f> angle_list)
    }

    vector<Point> integer_hull;
-    convexHull(Mat(locations), integer_hull);
+    convexHull(locations, integer_hull);
    int hull_size = (int)integer_hull.size();
    vector<Point2f> hull(hull_size);
    for (int i = 0; i < hull_size; i++)
@ -901,7 +901,7 @@ bool QRDecode::versionDefinition()
    vector<Point> locations, non_zero_elem;
    Mat mask_roi = mask(Range(1, intermediate.rows - 1), Range(1, intermediate.cols - 1));
    findNonZero(mask_roi, non_zero_elem);
-    convexHull(Mat(non_zero_elem), locations);
+    convexHull(non_zero_elem, locations);
    Point offset = computeOffset(locations);

    Point temp_remote = locations[0], remote_point;
--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@ -646,8 +646,6 @@ private:
        Size size = frame0.size();
        UMat prevFlowX, prevFlowY, curFlowX, curFlowY;

-        flowx.create(size, CV_32F);
-        flowy.create(size, CV_32F);
        UMat flowx0 = flowx;
        UMat flowy0 = flowy;

@ -1075,12 +1073,19 @@ private:
            return false;

        std::vector<UMat> flowar;
-        if (!_flow0.empty())
+
+        // If flag is set, check for integrity; if not set, allocate memory space
+        if (flags_ & OPTFLOW_USE_INITIAL_FLOW)
+        {
+            if (_flow0.empty() || _flow0.size() != _prev0.size() || _flow0.channels() != 2 ||
+                _flow0.depth() != CV_32F)
+                return false;
            split(_flow0, flowar);
+        }
        else
        {
-            flowar.push_back(UMat());
-            flowar.push_back(UMat());
+            flowar.push_back(UMat(_prev0.size(), CV_32FC1));
+            flowar.push_back(UMat(_prev0.size(), CV_32FC1));
        }
        if(!this->operator()(_prev0.getUMat(), _next0.getUMat(), flowar[0], flowar[1])){
            return false;
@ -1112,7 +1117,14 @@ void FarnebackOpticalFlowImpl::calc(InputArray _prev0, InputArray _next0,

    CV_Assert( prev0.size() == next0.size() && prev0.channels() == next0.channels() &&
               prev0.channels() == 1 && pyrScale_ < 1 );
+
+    // If flag is set, check for integrity; if not set, allocate memory space
+    if( flags_ & OPTFLOW_USE_INITIAL_FLOW )
+        CV_Assert( _flow0.size() == prev0.size() && _flow0.channels() == 2 &&
+                   _flow0.depth() == CV_32F );
+    else
        _flow0.create( prev0.size(), CV_32FC2 );
+
    Mat flow0 = _flow0.getMat();

    for( k = 0, scale = 1; k < levels; k++ )
--- a/samples/dnn/models.yml
+++ b/samples/dnn/models.yml
@ -90,6 +90,18 @@ squeezenet:
  classes: "classification_classes_ILSVRC2012.txt"
  sample: "classification"

+# Googlenet from https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
+googlenet:
+  model: "bvlc_googlenet.caffemodel"
+  config: "bvlc_googlenet.prototxt"
+  mean: [104, 117, 123]
+  scale: 1.0
+  width: 224
+  height: 224
+  rgb: false
+  classes: "classification_classes_ILSVRC2012.txt"
+  sample: "classification"
+
 ################################################################################
 # Semantic segmentation models.
 ################################################################################
--- a/samples/dnn/tf_text_graph_common.py
+++ b/samples/dnn/tf_text_graph_common.py
@ -289,7 +289,7 @@ def removeUnusedNodesAndAttrs(to_remove, graph_def):
        op = graph_def.node[i].op
        name = graph_def.node[i].name

-        if op == 'Const' or to_remove(name, op):
+        if to_remove(name, op):
            if op != 'Const':
                removedNodes.append(name)

--- a/samples/dnn/tf_text_graph_faster_rcnn.py
+++ b/samples/dnn/tf_text_graph_faster_rcnn.py
@ -48,10 +48,42 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):

    removeIdentity(graph_def)

+    nodesToKeep = []
    def to_remove(name, op):
-        return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
+        if name in nodesToKeep:
+            return False
+        return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
               (name.startswith('CropAndResize') and op != 'CropAndResize')

+    # Fuse atrous convolutions (with dilations).
+    nodesMap = {node.name: node for node in graph_def.node}
+    for node in reversed(graph_def.node):
+        if node.op == 'BatchToSpaceND':
+            del node.input[2]
+            conv = nodesMap[node.input[0]]
+            spaceToBatchND = nodesMap[conv.input[0]]
+
+            # Extract paddings
+            stridedSlice = nodesMap[spaceToBatchND.input[2]]
+            assert(stridedSlice.op == 'StridedSlice')
+            pack = nodesMap[stridedSlice.input[0]]
+            assert(pack.op == 'Pack')
+
+            padNodeH = nodesMap[nodesMap[pack.input[0]].input[0]]
+            padNodeW = nodesMap[nodesMap[pack.input[1]].input[0]]
+            padH = int(padNodeH.attr['value']['tensor'][0]['int_val'][0])
+            padW = int(padNodeW.attr['value']['tensor'][0]['int_val'][0])
+
+            paddingsNode = NodeDef()
+            paddingsNode.name = conv.name + '/paddings'
+            paddingsNode.op = 'Const'
+            paddingsNode.addAttr('value', [padH, padH, padW, padW])
+            graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
+            nodesToKeep.append(paddingsNode.name)
+
+            spaceToBatchND.input[2] = paddingsNode.name
+
+
    removeUnusedNodesAndAttrs(to_remove, graph_def)


@ -225,6 +257,26 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
    detectionOut.addAttr('variance_encoded_in_target', True)
    graph_def.node.extend([detectionOut])

+    def getUnconnectedNodes():
+        unconnected = [node.name for node in graph_def.node]
+        for node in graph_def.node:
+            for inp in node.input:
+                if inp in unconnected:
+                    unconnected.remove(inp)
+        return unconnected
+
+    while True:
+        unconnectedNodes = getUnconnectedNodes()
+        unconnectedNodes.remove(detectionOut.name)
+        if not unconnectedNodes:
+            break
+
+        for name in unconnectedNodes:
+            for i in range(len(graph_def.node)):
+                if graph_def.node[i].name == name:
+                    del graph_def.node[i]
+                    break
+
    # Save as text.
    graph_def.save(outputPath)

--- a/samples/dnn/tf_text_graph_mask_rcnn.py
+++ b/samples/dnn/tf_text_graph_mask_rcnn.py
@ -55,7 +55,7 @@ graph_def = parseTextGraph(args.output)
 removeIdentity(graph_def)

 def to_remove(name, op):
-    return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
+    return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
           (name.startswith('CropAndResize') and op != 'CropAndResize')

 removeUnusedNodesAndAttrs(to_remove, graph_def)
--- a/samples/dnn/tf_text_graph_ssd.py
+++ b/samples/dnn/tf_text_graph_ssd.py
@ -10,14 +10,60 @@
 # Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function.
 # See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
 import argparse
+import re
 from math import sqrt
 from tf_text_graph_common import *

+class SSDAnchorGenerator:
+    def __init__(self, min_scale, max_scale, num_layers, aspect_ratios,
+                 reduce_boxes_in_lowest_layer, image_width, image_height):
+        self.min_scale = min_scale
+        self.aspect_ratios = aspect_ratios
+        self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer
+        self.image_width = image_width
+        self.image_height = image_height
+        self.scales =  [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
+                            for i in range(num_layers)] + [1.0]
+
+    def get(self, layer_id):
+        if layer_id == 0 and self.reduce_boxes_in_lowest_layer:
+            widths = [0.1, self.min_scale * sqrt(2.0), self.min_scale * sqrt(0.5)]
+            heights = [0.1, self.min_scale / sqrt(2.0), self.min_scale / sqrt(0.5)]
+        else:
+            widths = [self.scales[layer_id] * sqrt(ar) for ar in self.aspect_ratios]
+            heights = [self.scales[layer_id] / sqrt(ar) for ar in self.aspect_ratios]
+
+            widths += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
+            heights += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
+        widths = [w * self.image_width for w in widths]
+        heights = [h * self.image_height for h in heights]
+        return widths, heights
+
+
+class MultiscaleAnchorGenerator:
+    def __init__(self, min_level, aspect_ratios, scales_per_octave, anchor_scale):
+        self.min_level = min_level
+        self.aspect_ratios = aspect_ratios
+        self.anchor_scale = anchor_scale
+        self.scales = [2**(float(s) / scales_per_octave) for s in range(scales_per_octave)]
+
+    def get(self, layer_id):
+        widths = []
+        heights = []
+        for a in self.aspect_ratios:
+            for s in self.scales:
+                base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
+                ar = sqrt(a)
+                heights.append(base_anchor_size * s / ar)
+                widths.append(base_anchor_size * s * ar)
+        return widths, heights
+
+
 def createSSDGraph(modelPath, configPath, outputPath):
    # Nodes that should be kept.
-    keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu6', 'Placeholder', 'FusedBatchNorm',
+    keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu', 'Relu6', 'Placeholder', 'FusedBatchNorm',
               'DepthwiseConv2dNative', 'ConcatV2', 'Mul', 'MaxPool', 'AvgPool', 'Identity',
-               'Sub']
+               'Sub', 'ResizeNearestNeighbor', 'Pad']

    # Node with which prefixes should be removed
    prefixesToRemove = ('MultipleGridAnchorGenerator/', 'Postprocessor/', 'Preprocessor/map')
@ -27,7 +73,15 @@ def createSSDGraph(modelPath, configPath, outputPath):
    config = config['model'][0]['ssd'][0]
    num_classes = int(config['num_classes'][0])

-    ssd_anchor_generator = config['anchor_generator'][0]['ssd_anchor_generator'][0]
+    fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0]
+    image_width = int(fixed_shape_resizer['width'][0])
+    image_height = int(fixed_shape_resizer['height'][0])
+
+    box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional'
+
+    anchor_generator = config['anchor_generator'][0]
+    if 'ssd_anchor_generator' in anchor_generator:
+        ssd_anchor_generator = anchor_generator['ssd_anchor_generator'][0]
        min_scale = float(ssd_anchor_generator['min_scale'][0])
        max_scale = float(ssd_anchor_generator['max_scale'][0])
        num_layers = int(ssd_anchor_generator['num_layers'][0])
@ -35,18 +89,34 @@ def createSSDGraph(modelPath, configPath, outputPath):
        reduce_boxes_in_lowest_layer = True
        if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator:
            reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true'
+        priors_generator = SSDAnchorGenerator(min_scale, max_scale, num_layers,
+                                              aspect_ratios, reduce_boxes_in_lowest_layer,
+                                              image_width, image_height)

-    fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0]
-    image_width = int(fixed_shape_resizer['width'][0])
-    image_height = int(fixed_shape_resizer['height'][0])

-    box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional'
-
-    print('Number of classes: %d' % num_classes)
-    print('Number of layers: %d' % num_layers)
        print('Scale: [%f-%f]' % (min_scale, max_scale))
        print('Aspect ratios: %s' % str(aspect_ratios))
        print('Reduce boxes in the lowest layer: %s' % str(reduce_boxes_in_lowest_layer))
+    elif 'multiscale_anchor_generator' in anchor_generator:
+        multiscale_anchor_generator = anchor_generator['multiscale_anchor_generator'][0]
+        min_level = int(multiscale_anchor_generator['min_level'][0])
+        max_level = int(multiscale_anchor_generator['max_level'][0])
+        anchor_scale = float(multiscale_anchor_generator['anchor_scale'][0])
+        aspect_ratios = [float(ar) for ar in multiscale_anchor_generator['aspect_ratios']]
+        scales_per_octave = int(multiscale_anchor_generator['scales_per_octave'][0])
+        num_layers = max_level - min_level + 1
+        priors_generator = MultiscaleAnchorGenerator(min_level, aspect_ratios,
+                                                     scales_per_octave, anchor_scale)
+        print('Levels: [%d-%d]' % (min_level, max_level))
+        print('Anchor scale: %f' % anchor_scale)
+        print('Scales per octave: %d' % scales_per_octave)
+        print('Aspect ratios: %s' % str(aspect_ratios))
+    else:
+        print('Unknown anchor_generator')
+        exit(0)
+
+    print('Number of classes: %d' % num_classes)
+    print('Number of layers: %d' % num_layers)
    print('box predictor: %s' % box_predictor)
    print('Input image size: %dx%d' % (image_width, image_height))

@ -67,8 +137,8 @@ def createSSDGraph(modelPath, configPath, outputPath):
        return unconnected


+    def fuse_nodes(nodesToKeep):
        # Detect unfused batch normalization nodes and fuse them.
-    def fuse_batch_normalization():
        # Add_0 <-- moving_variance, add_y
        # Rsqrt <-- Add_0
        # Mul_0 <-- Rsqrt, gamma
@ -77,9 +147,15 @@ def createSSDGraph(modelPath, configPath, outputPath):
        # Sub_0 <-- beta, Mul_2
        # Add_1 <-- Mul_1, Sub_0
        nodesMap = {node.name: node for node in graph_def.node}
-        subgraph = ['Add',
+        subgraphBatchNorm = ['Add',
            ['Mul', 'input', ['Mul', ['Rsqrt', ['Add', 'moving_variance', 'add_y']], 'gamma']],
            ['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]]
+        # Detect unfused nearest neighbor resize.
+        subgraphResizeNN = ['Reshape',
+            ['Mul', ['Reshape', 'input', ['Pack', 'shape_1', 'shape_2', 'shape_3', 'shape_4', 'shape_5']],
+                    'ones'],
+            ['Pack', ['StridedSlice', ['Shape', 'input'], 'stack', 'stack_1', 'stack_2'],
+                     'out_height', 'out_width', 'out_channels']]
        def checkSubgraph(node, targetNode, inputs, fusedNodes):
            op = targetNode[0]
            if node.op == op and (len(node.input) >= len(targetNode) - 1):
@ -100,7 +176,7 @@ def createSSDGraph(modelPath, configPath, outputPath):
        for node in graph_def.node:
            inputs = {}
            fusedNodes = []
-            if checkSubgraph(node, subgraph, inputs, fusedNodes):
+            if checkSubgraph(node, subgraphBatchNorm, inputs, fusedNodes):
                name = node.name
                node.Clear()
                node.name = name
@ -112,15 +188,41 @@ def createSSDGraph(modelPath, configPath, outputPath):
                node.input.append(inputs['moving_variance'])
                node.addAttr('epsilon', 0.001)
                nodesToRemove += fusedNodes[1:]
+
+            inputs = {}
+            fusedNodes = []
+            if checkSubgraph(node, subgraphResizeNN, inputs, fusedNodes):
+                name = node.name
+                node.Clear()
+                node.name = name
+                node.op = 'ResizeNearestNeighbor'
+                node.input.append(inputs['input'])
+                node.input.append(name + '/output_shape')
+
+                out_height_node = nodesMap[inputs['out_height']]
+                out_width_node = nodesMap[inputs['out_width']]
+                out_height = int(out_height_node.attr['value']['tensor'][0]['int_val'][0])
+                out_width = int(out_width_node.attr['value']['tensor'][0]['int_val'][0])
+
+                shapeNode = NodeDef()
+                shapeNode.name = name + '/output_shape'
+                shapeNode.op = 'Const'
+                shapeNode.addAttr('value', [out_height, out_width])
+                graph_def.node.insert(graph_def.node.index(node), shapeNode)
+                nodesToKeep.append(shapeNode.name)
+
+                nodesToRemove += fusedNodes[1:]
        for node in nodesToRemove:
            graph_def.node.remove(node)

-    fuse_batch_normalization()
+    nodesToKeep = []
+    fuse_nodes(nodesToKeep)

    removeIdentity(graph_def)

    def to_remove(name, op):
-        return (not op in keepOps) or name.startswith(prefixesToRemove)
+        return (not name in nodesToKeep) and \
+               (op == 'Const' or (not op in keepOps) or name.startswith(prefixesToRemove))

    removeUnusedNodesAndAttrs(to_remove, graph_def)

@ -169,19 +271,15 @@ def createSSDGraph(modelPath, configPath, outputPath):
            graph_def.node.extend([flatten])
        addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten')

-    idx = 0
+    num_matched_layers = 0
    for node in graph_def.node:
-        if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx) or \
-           node.name == ('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/Conv2D' % idx) or \
-           node.name == 'WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D':
+        if re.match('BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \
+           re.match('WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name):
            node.addAttr('loc_pred_transposed', True)
-            idx += 1
-    assert(idx == num_layers)
+            num_matched_layers += 1
+    assert(num_matched_layers == num_layers)

    # Add layers that generate anchors (bounding boxes proposals).
-    scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
-              for i in range(num_layers)] + [1.0]
-
    priorBoxes = []
    for i in range(num_layers):
        priorBox = NodeDef()
@ -199,17 +297,8 @@ def createSSDGraph(modelPath, configPath, outputPath):
        priorBox.addAttr('flip', False)
        priorBox.addAttr('clip', False)

-        if i == 0 and reduce_boxes_in_lowest_layer:
-            widths = [0.1, min_scale * sqrt(2.0), min_scale * sqrt(0.5)]
-            heights = [0.1, min_scale / sqrt(2.0), min_scale / sqrt(0.5)]
-        else:
-            widths = [scales[i] * sqrt(ar) for ar in aspect_ratios]
-            heights = [scales[i] / sqrt(ar) for ar in aspect_ratios]
+        widths, heights = priors_generator.get(i)

-            widths += [sqrt(scales[i] * scales[i + 1])]
-            heights += [sqrt(scales[i] * scales[i + 1])]
-        widths = [w * image_width for w in widths]
-        heights = [h * image_height for h in heights]
        priorBox.addAttr('width', widths)
        priorBox.addAttr('height', heights)
        priorBox.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
@ -217,6 +306,7 @@ def createSSDGraph(modelPath, configPath, outputPath):
        graph_def.node.extend([priorBox])
        priorBoxes.append(priorBox.name)

+    # Compare this layer's output with Postprocessor/Reshape
    addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')

    # Sigmoid for classes predictions and DetectionOutput layer