diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6ef10448..82c18f3508 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -577,7 +577,7 @@ else() # Note: layout differs from OpenCV 3.4 include(GNUInstallDirs) ocv_update(OPENCV_INCLUDE_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}/opencv4") - ocv_update(OPENCV_LIB_INSTALL_PATH "${CMAKE_INSTALL_LIBDIR}${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${CMAKE_INSTALL_LIBDIR}") ocv_update(OPENCV_CONFIG_INSTALL_PATH "${OPENCV_LIB_INSTALL_PATH}/cmake/opencv4") ocv_update(OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_LIB_INSTALL_PATH}/opencv4/3rdparty") ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH "${CMAKE_INSTALL_DATAROOTDIR}/opencv4/samples") diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index a7ef8e1389..1d079a38d1 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -144,6 +144,7 @@ if(DOXYGEN_FOUND) string(REPLACE ";" " " CMAKE_DOXYGEN_ENABLED_SECTIONS "${CMAKE_DOXYGEN_ENABLED_SECTIONS}") # TODO: remove paths_doc from EXAMPLE_PATH after face module tutorials/samples moved to separate folders string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_EXAMPLE_PATH "${example_path} ; ${paths_doc} ; ${paths_sample}") + string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_INCLUDE_ROOTS "${paths_include}") set(CMAKE_DOXYGEN_LAYOUT "${CMAKE_CURRENT_BINARY_DIR}/DoxygenLayout.xml") set(CMAKE_DOXYGEN_OUTPUT_PATH "doxygen") set(CMAKE_DOXYGEN_MAIN_REFERENCE "${refs_main}") diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 9ac60f1b44..85f8fe161f 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -22,8 +22,8 @@ ABBREVIATE_BRIEF = "The $name class" \ ALWAYS_DETAILED_SEC = NO INLINE_INHERITED_MEMB = NO FULL_PATH_NAMES = YES -STRIP_FROM_PATH = @CMAKE_SOURCE_DIR@/modules -STRIP_FROM_INC_PATH = +STRIP_FROM_PATH = @CMAKE_SOURCE_DIR@/modules @CMAKE_DOXYGEN_INCLUDE_ROOTS@ +STRIP_FROM_INC_PATH = @CMAKE_DOXYGEN_INCLUDE_ROOTS@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO QT_AUTOBRIEF = NO @@ -72,8 +72,8 @@ INTERNAL_DOCS = NO CASE_SENSE_NAMES = YES HIDE_SCOPE_NAMES = NO SHOW_INCLUDE_FILES = YES -SHOW_GROUPED_MEMB_INC = NO -FORCE_LOCAL_INCLUDES = YES +SHOW_GROUPED_MEMB_INC = YES +FORCE_LOCAL_INCLUDES = NO INLINE_INFO = YES SORT_MEMBER_DOCS = YES SORT_BRIEF_DOCS = YES diff --git a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown index 8ab4c53908..ca7853d96a 100644 --- a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown +++ b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown @@ -53,8 +53,8 @@ import numpy as np import cv2 as cv import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate ORB detector orb = cv.ORB_create() @@ -79,7 +79,7 @@ matches = bf.match(des1,des2) matches = sorted(matches, key = lambda x:x.distance) # Draw first 10 matches. -img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10], flags=2) +img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10],None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS) plt.imshow(img3),plt.show() @endcode @@ -104,13 +104,13 @@ so that we can apply ratio test explained by D.Lowe in his paper. 
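Aside for readers running this tutorial: the listings in this patch reflect two Python-API updates — images are read with the named `cv.IMREAD_GRAYSCALE` flag rather than the bare `0`, and `drawMatches`/`drawMatchesKnn` now take an explicit output-image argument (pass `None` to let OpenCV allocate it) plus a named `DrawMatchesFlags` constant. A minimal, self-contained sketch consolidating those calls, assuming the two sample images from this tutorial are on disk:

```python
import cv2 as cv

# Sketch only: 'box.png' / 'box_in_scene.png' are the tutorial's sample images.
img1 = cv.imread('box.png', cv.IMREAD_GRAYSCALE)           # queryImage
img2 = cv.imread('box_in_scene.png', cv.IMREAD_GRAYSCALE)  # trainImage

# ORB gives binary descriptors, so Hamming distance is the right norm;
# crossCheck=True keeps only mutually-best matches.
orb = cv.ORB_create()
kp1, des1 = orb.detectAndCompute(img1, None)
kp2, des2 = orb.detectAndCompute(img2, None)
bf = cv.BFMatcher(cv.NORM_HAMMING, crossCheck=True)
matches = sorted(bf.match(des1, des2), key=lambda m: m.distance)

# drawMatches takes an explicit output-image argument (None = allocate internally)
# and a named flags constant instead of a bare integer.
out = cv.drawMatches(img1, kp1, img2, kp2, matches[:10], None,
                     flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)
cv.imwrite('matches.png', out)
```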
@code{.py} import numpy as np import cv2 as cv -from matplotlib import pyplot as plt +import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate SIFT detector -sift = cv.SIFT() +sift = cv.xfeatures2d.SIFT_create() # find the keypoints and descriptors with SIFT kp1, des1 = sift.detectAndCompute(img1,None) @@ -118,7 +118,7 @@ kp2, des2 = sift.detectAndCompute(img2,None) # BFMatcher with default params bf = cv.BFMatcher() -matches = bf.knnMatch(des1,des2, k=2) +matches = bf.knnMatch(des1,des2,k=2) # Apply ratio test good = [] @@ -127,7 +127,7 @@ for m,n in matches: good.append([m]) # cv.drawMatchesKnn expects list of lists as matches. -img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,flags=2) +img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS) plt.imshow(img3),plt.show() @endcode @@ -168,13 +168,13 @@ With this information, we are good to go. @code{.py} import numpy as np import cv2 as cv -from matplotlib import pyplot as plt +import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate SIFT detector -sift = cv.SIFT() +sift = cv.xfeatures2d.SIFT_create() # find the keypoints and descriptors with SIFT kp1, des1 = sift.detectAndCompute(img1,None) @@ -190,7 +190,7 @@ flann = cv.FlannBasedMatcher(index_params,search_params) matches = flann.knnMatch(des1,des2,k=2) # Need to draw only good matches, so create a mask -matchesMask = [[0,0] for i in xrange(len(matches))] +matchesMask = [[0,0] for i in range(len(matches))] # ratio test as per Lowe's paper for i,(m,n) in enumerate(matches): @@ -200,7 +200,7 @@ for i,(m,n) in enumerate(matches): draw_params = dict(matchColor = (0,255,0), singlePointColor = (255,0,0), matchesMask = matchesMask, - flags = 0) + flags = cv.DrawMatchesFlags_DEFAULT) img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,matches,None,**draw_params) diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp index 0a21231c4d..481fafd452 100644 --- a/modules/calib3d/src/circlesgrid.cpp +++ b/modules/calib3d/src/circlesgrid.cpp @@ -156,7 +156,7 @@ void CirclesGridClusterFinder::findGrid(const std::vector &points, #endif std::vector hull2f; - convexHull(Mat(patternPoints), hull2f, false); + convexHull(patternPoints, hull2f, false); const size_t cornersCount = isAsymmetricGrid ? 
6 : 4; if(hull2f.size() < cornersCount) return; @@ -407,7 +407,7 @@ void CirclesGridClusterFinder::rectifyPatternPoints(const std::vector dstKeypoints; convertPointsFromHomogeneous(dstKeypointsMat, dstKeypoints); @@ -1168,7 +1168,7 @@ void CirclesGridFinder::findBasis(const std::vector &samples, std::vect } for (size_t i = 0; i < basis.size(); i++) { - convexHull(Mat(clusters[i]), hulls[i]); + convexHull(clusters[i], hulls[i]); } basisGraphs.resize(basis.size(), Graph(keypoints.size())); @@ -1183,7 +1183,7 @@ void CirclesGridFinder::findBasis(const std::vector &samples, std::vect for (size_t k = 0; k < hulls.size(); k++) { - if (pointPolygonTest(Mat(hulls[k]), vec, false) >= 0) + if (pointPolygonTest(hulls[k], vec, false) >= 0) { basisGraphs[k].addEdge(i, j); } @@ -1414,7 +1414,6 @@ void CirclesGridFinder::drawHoles(const Mat &srcImage, Mat &drawImage) const if (i != holes.size() - 1) line(drawImage, keypoints[holes[i][j]], keypoints[holes[i + 1][j]], Scalar(255, 0, 0), 2); - //circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness); circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness); } } diff --git a/modules/calib3d/src/homography_decomp.cpp b/modules/calib3d/src/homography_decomp.cpp index fea8882c5a..3bfb62ec2c 100644 --- a/modules/calib3d/src/homography_decomp.cpp +++ b/modules/calib3d/src/homography_decomp.cpp @@ -185,6 +185,10 @@ bool HomographyDecompZhang::findMotionFrom_tstar_n(const cv::Vec3d& tstar, const temp(1, 1) += 1.0; temp(2, 2) += 1.0; motion.R = getHnorm() * temp.inv(); + if (cv::determinant(motion.R) < 0) + { + motion.R *= -1; + } motion.t = motion.R * tstar; motion.n = n; return passesSameSideOfPlaneConstraint(motion); @@ -312,6 +316,10 @@ void HomographyDecompInria::findRmatFrom_tstar_n(const cv::Vec3d& tstar, const c 0.0, 0.0, 1.0); R = getHnorm() * (I - (2/v) * tstar_m * n_m.t() ); + if (cv::determinant(R) < 0) + { + R *= -1; + } } void HomographyDecompInria::decompose(std::vector& camMotions) diff --git a/modules/calib3d/src/quadsubpix.cpp b/modules/calib3d/src/quadsubpix.cpp index 77bc498591..b4100a22f9 100644 --- a/modules/calib3d/src/quadsubpix.cpp +++ b/modules/calib3d/src/quadsubpix.cpp @@ -194,9 +194,8 @@ bool cv::find4QuadCornerSubpix(InputArray _img, InputOutputArray _corners, Size erode(white_comp, white_comp, Mat(), Point(-1, -1), erode_count); std::vector > white_contours, black_contours; - std::vector white_hierarchy, black_hierarchy; - findContours(black_comp, black_contours, black_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE); - findContours(white_comp, white_contours, white_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE); + findContours(black_comp, black_contours, RETR_LIST, CHAIN_APPROX_SIMPLE); + findContours(white_comp, white_contours, RETR_LIST, CHAIN_APPROX_SIMPLE); if(black_contours.size() < 5 || white_contours.size() < 5) continue; diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index c013d5adf8..74dd272943 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -1408,7 +1408,7 @@ bool CV_StereoCalibrationTest::checkPandROI( int test_case_idx, const Mat& M, co for( x = 0; x < N; x++ ) pts.push_back(Point2f((float)x*imgsize.width/(N-1), (float)y*imgsize.height/(N-1))); - undistortPoints(Mat(pts), upts, M, D, R, P ); + undistortPoints(pts, upts, M, D, R, P ); for( k = 0; k < N*N; k++ ) if( upts[k].x < -imgsize.width*eps || upts[k].x > imgsize.width*(1+eps) || upts[k].y < 
-imgsize.height*eps || upts[k].y > imgsize.height*(1+eps) ) @@ -1717,8 +1717,8 @@ void CV_StereoCalibrationTest::run( int ) for( int i = 0, k = 0; i < nframes; i++ ) { vector temp[2]; - undistortPoints(Mat(imgpt1[i]), temp[0], M1, D1, R1, P1); - undistortPoints(Mat(imgpt2[i]), temp[1], M2, D2, R2, P2); + undistortPoints(imgpt1[i], temp[0], M1, D1, R1, P1); + undistortPoints(imgpt2[i], temp[1], M2, D2, R2, P2); for( int j = 0; j < npoints; j++, k++ ) { diff --git a/modules/calib3d/test/test_cameracalibration_artificial.cpp b/modules/calib3d/test/test_cameracalibration_artificial.cpp index 165a66a7b1..a8351b6b66 100644 --- a/modules/calib3d/test/test_cameracalibration_artificial.cpp +++ b/modules/calib3d/test/test_cameracalibration_artificial.cpp @@ -353,7 +353,7 @@ protected: rvecs_spnp.resize(brdsNum); tvecs_spnp.resize(brdsNum); for(size_t i = 0; i < brdsNum; ++i) - solvePnP(Mat(objectPoints[i]), Mat(imagePoints[i]), camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]); + solvePnP(objectPoints[i], imagePoints[i], camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]); compareShiftVecs(tvecs_exp, tvecs_spnp); compareRotationVecs(rvecs_exp, rvecs_spnp); diff --git a/modules/calib3d/test/test_chessboardgenerator.cpp b/modules/calib3d/test/test_chessboardgenerator.cpp index 3a8c17345f..6926cb6e72 100644 --- a/modules/calib3d/test/test_chessboardgenerator.cpp +++ b/modules/calib3d/test/test_chessboardgenerator.cpp @@ -126,10 +126,10 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co generateEdge(p3, p4, pts_square3d); generateEdge(p4, p1, pts_square3d); - projectPoints(Mat(pts_square3d), rvec, tvec, camMat, distCoeffs, pts_square2d); + projectPoints(pts_square3d, rvec, tvec, camMat, distCoeffs, pts_square2d); squares_black.resize(squares_black.size() + 1); vector temp; - approxPolyDP(Mat(pts_square2d), temp, 1.0, true); + approxPolyDP(pts_square2d, temp, 1.0, true); transform(temp.begin(), temp.end(), back_inserter(squares_black.back()), Mult(rendererResolutionMultiplier)); } @@ -139,7 +139,7 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co for(int i = 0; i < patternSize.width - 1; ++i) corners3d.push_back(zero + (i + 1) * sqWidth * pb1 + (j + 1) * sqHeight * pb2); corners.clear(); - projectPoints(Mat(corners3d), rvec, tvec, camMat, distCoeffs, corners); + projectPoints(corners3d, rvec, tvec, camMat, distCoeffs, corners); vector whole3d; vector whole2d; @@ -147,9 +147,9 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co generateEdge(whole[1], whole[2], whole3d); generateEdge(whole[2], whole[3], whole3d); generateEdge(whole[3], whole[0], whole3d); - projectPoints(Mat(whole3d), rvec, tvec, camMat, distCoeffs, whole2d); + projectPoints(whole3d, rvec, tvec, camMat, distCoeffs, whole2d); vector temp_whole2d; - approxPolyDP(Mat(whole2d), temp_whole2d, 1.0, true); + approxPolyDP(whole2d, temp_whole2d, 1.0, true); vector< vector > whole_contour(1); transform(temp_whole2d.begin(), temp_whole2d.end(), @@ -213,7 +213,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0; bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0; 
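The test updates above (and the remaining ChessBoardGenerator hunks below) only drop the redundant `Mat()` wrappers around point vectors; `projectPoints`, `approxPolyDP`, `undistortPoints` and `solvePnP` accept `std::vector` inputs directly through the `InputArray` interface, so behaviour is unchanged. For reference, a minimal hedged sketch of the same `projectPoints` call from Python, with purely illustrative camera parameters (identity intrinsics, zero rotation/translation, no distortion):

```python
import numpy as np
import cv2 as cv

# Illustrative values only: three 3D points in front of a unit-focal pinhole camera.
object_points = np.array([[0, 0, 5], [1, 0, 5], [0, 1, 5]], dtype=np.float32)
rvec = np.zeros(3, dtype=np.float32)          # no rotation
tvec = np.zeros(3, dtype=np.float32)          # no translation
camera_matrix = np.eye(3, dtype=np.float32)   # identity intrinsics
dist_coeffs = np.zeros(5, dtype=np.float32)   # no lens distortion

image_points, _ = cv.projectPoints(object_points, rvec, tvec, camera_matrix, dist_coeffs)
print(image_points.reshape(-1, 2))  # projected 2D coordinates of each 3D point
```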
@@ -278,7 +278,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0; bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0; @@ -320,7 +320,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); Point3f zero = p - pb1 * cbHalfWidth - cbHalfHeight * pb2; diff --git a/modules/calib3d/test/test_homography_decomp.cpp b/modules/calib3d/test/test_homography_decomp.cpp index 45f5ae63ee..9ddc0e913d 100644 --- a/modules/calib3d/test/test_homography_decomp.cpp +++ b/modules/calib3d/test/test_homography_decomp.cpp @@ -134,4 +134,36 @@ private: TEST(Calib3d_DecomposeHomography, regression) { CV_HomographyDecompTest test; test.safe_run(); } + +TEST(Calib3d_DecomposeHomography, issue_4978) +{ + Matx33d K( + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0 + ); + + Matx33d H( + -0.102896, 0.270191, -0.0031153, + 0.0406387, 1.19569, -0.0120456, + 0.445351, 0.0410889, 1 + ); + + vector rotations; + vector translations; + vector normals; + + decomposeHomographyMat(H, K, rotations, translations, normals); + + ASSERT_GT(rotations.size(), (size_t)0u); + for (size_t i = 0; i < rotations.size(); i++) + { + // check: det(R) = 1 + EXPECT_TRUE(std::fabs(cv::determinant(rotations[i]) - 1.0) < 0.01) + << "R: det=" << cv::determinant(rotations[0]) << std::endl << rotations[i] << std::endl + << "T:" << std::endl << translations[i] << std::endl; + } +} + + }} // namespace diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp index 8eec7a7167..2359fa9282 100644 --- a/modules/calib3d/test/test_solvepnp_ransac.cpp +++ b/modules/calib3d/test/test_solvepnp_ransac.cpp @@ -124,7 +124,7 @@ protected: vector projectedPoints; projectedPoints.resize(points.size()); - projectPoints(Mat(points), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(points, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); for (size_t i = 0; i < projectedPoints.size(); i++) { if (i % 20 == 0) @@ -241,7 +241,7 @@ protected: vector projectedPoints; projectedPoints.resize(opoints.size()); - projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); bool isEstimateSuccess = solvePnP(opoints, projectedPoints, intrinsics, distCoeffs, rvec, tvec, false, method); if (isEstimateSuccess == false) @@ -291,7 +291,7 @@ class CV_solveP3P_Test : public CV_solvePnPRansac_Test vector projectedPoints; projectedPoints.resize(opoints.size()); - projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); int num_of_solutions = solveP3P(opoints, projectedPoints, intrinsics, distCoeffs, rvecs, tvecs, method); if (num_of_solutions != (int) rvecs.size() || num_of_solutions != (int) tvecs.size() || num_of_solutions == 0) 
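The new issue_4978 regression test above pins the fix in homography_decomp.cpp: decompositions that come back as reflections (det(R) = -1) are now flipped into proper rotations. A minimal sketch reproducing the same check from Python, reusing the H and K matrices from the test:

```python
import numpy as np
import cv2 as cv

# H and K taken from the issue_4978 regression test above.
K = np.eye(3)
H = np.array([[-0.102896,  0.270191, -0.0031153],
              [ 0.0406387, 1.19569,  -0.0120456],
              [ 0.445351,  0.0410889, 1.0]])

num, rotations, translations, normals = cv.decomposeHomographyMat(H, K)
assert num > 0
for R in rotations:
    # Every returned rotation should be proper: det(R) == +1 within tolerance.
    assert abs(np.linalg.det(R) - 1.0) < 0.01
```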
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index bf9519dd1c..0e2c66b1f1 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -186,6 +186,16 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard # endif #endif +#ifndef CV_ALWAYS_INLINE +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) +#define CV_ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define CV_ALWAYS_INLINE __forceinline +#else +#define CV_ALWAYS_INLINE inline +#endif +#endif + #if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED) # define CV_ENABLE_UNROLLED 0 #else diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp index b41efdae1b..b14f4f66cc 100644 --- a/modules/dnn/include/opencv2/dnn/version.hpp +++ b/modules/dnn/include/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. -#define OPENCV_DNN_API_VERSION 20181221 +#define OPENCV_DNN_API_VERSION 20190122 #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION) diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index cc95cc58ae..d06689a7fb 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -157,8 +157,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) PERF_TEST_P_(DNNTestNetwork, DenseNet_121) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD))) + if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "", Mat(cv::Size(224, 224), CV_32FC3)); @@ -211,8 +210,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow) PERF_TEST_P_(DNNTestNetwork, YOLOv3) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)) + if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/dog416.png", false)); Mat inp; @@ -222,8 +220,11 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3) PERF_TEST_P_(DNNTestNetwork, EAST_text_detection) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)) + if (backend == DNN_BACKEND_HALIDE +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE < 2018030000 + || (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) +#endif + ) throw SkipTestException(""); processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3)); } diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 1282f59e8f..95643a287b 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -707,12 +707,6 @@ struct DataLayer : public Layer virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE - InferenceEngine::LayerParams lp; - lp.name = name; - lp.type = "ScaleShift"; - lp.precision = InferenceEngine::Precision::FP32; - std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); - CV_CheckEQ(inputsData.size(), (size_t)1, ""); CV_CheckEQ(inputsData[0].dims, 4, ""); const size_t numChannels = inputsData[0].size[1]; @@ -723,7 +717,6 @@ struct DataLayer : public Layer {numChannels}); 
weights->allocate(); weights->set(std::vector(numChannels, scaleFactors[0])); - ieLayer->_weights = weights; // Mean subtraction auto biases = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, @@ -735,8 +728,21 @@ struct DataLayer : public Layer biasesVec[i] = -means[0][i] * scaleFactors[0]; } biases->set(biasesVec); - ieLayer->_biases = biases; +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + ieLayer.setWeights(weights); + ieLayer.setBiases(biases); +#else + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "ScaleShift"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); + + ieLayer->_weights = weights; + ieLayer->_biases = biases; +#endif return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE return Ptr(); @@ -1480,7 +1486,11 @@ struct Net::Impl if (layerNet != ieInpNode->net) { // layerNet is empty or nodes are from different graphs. +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + ieInpNode->net->addOutput(ieInpNode->layer.getName()); +#else ieInpNode->net->addOutput(ieInpNode->layer->name); +#endif } } } @@ -1590,7 +1600,7 @@ struct Net::Impl // Build Inference Engine networks from sets of layers that support this // backend. Split a whole model on several Inference Engine networks if - // some of layers is not implemented. + // some of layers are not implemented. // Set of all input and output blobs wrappers for current network. std::map > netBlobsWrappers; @@ -1606,7 +1616,7 @@ struct Net::Impl { addInfEngineNetOutputs(ld); net = Ptr(); - netBlobsWrappers.clear(); + netBlobsWrappers.clear(); // Is not used for R5 release but we don't wrap it to #ifdef. layer->preferableTarget = DNN_TARGET_CPU; continue; } @@ -1624,12 +1634,13 @@ struct Net::Impl if (ieInpNode->net != net) { net = Ptr(); - netBlobsWrappers.clear(); + netBlobsWrappers.clear(); // Is not used for R5 release but we don't wrap it to #ifdef. break; } } } +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) // The same blobs wrappers cannot be shared between two Inference Engine // networks because of explicit references between layers and blobs. // So we need to rewrap all the external blobs. @@ -1646,6 +1657,7 @@ struct Net::Impl ld.inputBlobsWrappers[i] = it->second; } netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0]; +#endif // IE < R5 Ptr node; if (!net.empty()) @@ -1676,6 +1688,40 @@ struct Net::Impl CV_Assert(!ieNode.empty()); ieNode->net = net; + // Convert weights in FP16 for specific targets. +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || + preferableTarget == DNN_TARGET_MYRIAD || + preferableTarget == DNN_TARGET_FPGA) && !fused) + { + auto& blobs = ieNode->layer.getConstantData(); + if (blobs.empty()) + { + // In case of non weightable layer we have to specify + // it's precision adding dummy blob. 
+ auto blob = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP16, + InferenceEngine::Layout::C, {1}); + blob->allocate(); + blobs[""] = blob; + } + else + { + for (auto& it : blobs) + it.second = convertFp16(std::const_pointer_cast(it.second)); + } + } + + if (!fused) + net->addLayer(ieNode->layer); + + net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName()); + net->addBlobs(ld.inputBlobsWrappers); + net->addBlobs(ld.outputBlobsWrappers); + addInfEngineNetOutputs(ld); + +#else // IE >= R5 + auto weightableLayer = std::dynamic_pointer_cast(ieNode->layer); if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD || @@ -1713,10 +1759,10 @@ struct Net::Impl if (!fused) net->addLayer(ieNode->layer); addInfEngineNetOutputs(ld); +#endif // IE >= R5 } // Initialize all networks. - std::set initializedNets; for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it) { LayerData &ld = it->second; @@ -2622,7 +2668,11 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin) Net cvNet; cvNet.setInputsNames(inputsNames); +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + Ptr backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer(""))); +#else Ptr backendNode(new InfEngineBackendNode(0)); +#endif backendNode->net = Ptr(new InfEngineBackendNet(ieNet)); for (auto& it : ieNet.getOutputsInfo()) { diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 9a1707a3e8..522d0229ba 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -349,6 +349,14 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + + const size_t numChannels = weights_.total(); + ieLayer.setWeights(wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C)); + ieLayer.setBiases(wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ScaleShift"; @@ -360,6 +368,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 1eb149b3d1..9f8590bea7 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -110,6 +110,11 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::SplitLayer ieLayer(name); + ieLayer.setOutputPorts({InferenceEngine::Port()}); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); CV_Assert(!input->dims.empty()); @@ -123,6 +128,7 @@ public: ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]); #endif return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index bea2017729..19ab915ea6 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ 
b/modules/dnn/src/layers/concat_layer.cpp @@ -313,6 +313,14 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + + InferenceEngine::Builder::ConcatLayer ieLayer(name); + ieLayer.setAxis(clamp(axis, input->dims.size())); + ieLayer.setInputPorts(std::vector(inputs.size())); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; lp.name = name; @@ -321,6 +329,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::ConcatLayer(lp)); ieLayer->_axis = clamp(axis, input->dims.size()); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index fd31d9cc12..21a13c8d47 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -521,6 +521,54 @@ public: const int inpGroupCn = blobs[0].size[1]; const int group = inpCn / inpGroupCn; + auto ieWeights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); + if (newWeightAndBias) + { + if (weightsMat.isContinuous()) + { + Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); + ieWeights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW); + } + else + { + ieWeights = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, + ieWeights->dims()); + ieWeights->allocate(); + + Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn); + Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); + fusedWeights.copyTo(newWeights); + } + } + InferenceEngine::Blob::Ptr ieBiases; + if (hasBias() || fusedBias) + { + Mat biasesMat({outCn}, CV_32F, &biasvec[0]); + ieBiases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C); + } + +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ConvolutionLayer ieLayer(name); + + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setDilation({dilation.height, dilation.width}); + ieLayer.setPaddingsBegin({pad.height, pad.width}); + ieLayer.setPaddingsEnd({pad.height, pad.width}); + ieLayer.setGroup(group); + ieLayer.setOutDepth(outCn); + + ieLayer.setWeights(ieWeights); + if (ieBiases) + ieLayer.setBiases(ieBiases); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!padMode.empty()) + l.getParameters()["auto_pad"] = padMode == "VALID" ? 
std::string("valid") : std::string("same_upper"); + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Convolution"; @@ -557,32 +605,11 @@ public: ieLayer->_out_depth = outCn; ieLayer->_group = group; - ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); - if (newWeightAndBias) - { - if (weightsMat.isContinuous()) - { - Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); - ieLayer->_weights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW); - } - else - { - ieLayer->_weights = InferenceEngine::make_shared_blob( - InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, - ieLayer->_weights->dims()); - ieLayer->_weights->allocate(); - - Mat newWeights = infEngineBlobToMat(ieLayer->_weights).reshape(1, outCn); - Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); - fusedWeights.copyTo(newWeights); - } - } - if (hasBias() || fusedBias) - { - Mat biasesMat({outCn}, CV_32F, &biasvec[0]); - ieLayer->_biases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C); - } + ieLayer->_weights = ieWeights; + if (ieBiases) + ieLayer->_biases = ieBiases; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -1193,6 +1220,9 @@ public: #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { + if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width)) + return false; + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout const int group = numOutput / outGroupCn; if (group != 1) @@ -1747,6 +1777,27 @@ public: virtual Ptr initInfEngine(const std::vector > &) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout + const int group = numOutput / outGroupCn; + + InferenceEngine::Builder::DeconvolutionLayer ieLayer(name); + + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setDilation({dilation.height, dilation.width}); + ieLayer.setPaddingsBegin({pad.height, pad.width}); + ieLayer.setPaddingsEnd({pad.height, pad.width}); + ieLayer.setGroup(group); + ieLayer.setOutDepth(numOutput); + + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW)); + if (hasBias()) + { + ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C)); + } + return Ptr(new InfEngineBackendNode(ieLayer)); +#else const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout const int group = numOutput / outGroupCn; @@ -1786,6 +1837,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/crop_layer.cpp b/modules/dnn/src/layers/crop_layer.cpp index 32cdbbaa00..c7cd99c9aa 100644 --- a/modules/dnn/src/layers/crop_layer.cpp +++ b/modules/dnn/src/layers/crop_layer.cpp @@ -67,8 +67,12 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_INFERENCE_ENGINE && crop_ranges.size() == 4); +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) && crop_ranges.size() == 4; + else +#endif + return backendId == 
DNN_BACKEND_OPENCV; } bool getMemoryShapes(const std::vector &inputs, @@ -145,9 +149,10 @@ public: input(&crop_ranges[0]).copyTo(outputs[0]); } +#ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { -#ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Crop"; @@ -181,9 +186,11 @@ public: ieLayer->dim.push_back(crop_ranges[3].end - crop_ranges[3].start); #endif return Ptr(new InfEngineBackendNode(ieLayer)); -#endif // HAVE_INF_ENGINE +#else return Ptr(); +#endif // IE < R5 } +#endif std::vector crop_ranges; }; diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 2a21619d6c..cc87a120a8 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -939,6 +939,25 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::DetectionOutputLayer ieLayer(name); + + ieLayer.setNumClasses(_numClasses); + ieLayer.setShareLocation(_shareLocation); + ieLayer.setBackgroudLabelId(_backgroundLabelId); + ieLayer.setNMSThreshold(_nmsThreshold); + ieLayer.setTopK(_topK); + ieLayer.setKeepTopK(_keepTopK); + ieLayer.setConfidenceThreshold(_confidenceThreshold); + ieLayer.setVariantEncodedInTarget(_varianceEncodedInTarget); + ieLayer.setCodeType("caffe.PriorBoxParameter." + _codeType); + ieLayer.setInputPorts(std::vector(3)); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["eta"] = std::string("1.0"); + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "DetectionOutput"; @@ -956,6 +975,7 @@ public: ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0"; ieLayer->params["code_type"] = "caffe.PriorBoxParameter." 
+ _codeType; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 4ab8ed3a44..8fb596252f 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -153,10 +153,16 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer = func.initInfEngineBuilderAPI(); + ieLayer.setName(this->name); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = this->name; lp.precision = InferenceEngine::Precision::FP32; return Ptr(new InfEngineBackendNode(func.initInfEngine(lp))); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -355,6 +361,12 @@ struct ReLUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ReLULayer("").setNegativeSlope(slope); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ReLU"; @@ -363,6 +375,7 @@ struct ReLUFunctor ieLayer->params["negative_slope"] = format("%f", slope); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -472,6 +485,12 @@ struct ReLU6Functor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ClampLayer("").setMinValue(minValue).setMaxValue(maxValue); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "Clamp"; @@ -482,6 +501,7 @@ struct ReLU6Functor ieLayer->params["max"] = format("%f", maxValue); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -558,12 +578,19 @@ struct TanHFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::TanHLayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "TanH"; std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -640,12 +667,19 @@ struct SigmoidFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::SigmoidLayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "Sigmoid"; std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -724,11 +758,18 @@ struct ELUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ELULayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ELU"; return InferenceEngine::CNNLayerPtr(new InferenceEngine::CNNLayer(lp)); } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -805,6 +846,12 @@ struct 
AbsValFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ReLULayer("").setNegativeSlope(-1); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ReLU"; @@ -813,6 +860,7 @@ struct AbsValFunctor ieLayer->params["negative_slope"] = "-1.0"; return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -868,11 +916,18 @@ struct BNLLFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + CV_Error(Error::StsNotImplemented, ""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { CV_Error(Error::StsNotImplemented, "BNLL"); return InferenceEngine::CNNLayerPtr(); } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -985,6 +1040,14 @@ struct PowerFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::PowerLayer("").setPower(power) + .setScale(scale) + .setShift(shift); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { if (power == 1.0f && scale == 1.0f && shift == 0.0f) @@ -1004,6 +1067,7 @@ struct PowerFunctor return ieLayer; } } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -1143,6 +1207,15 @@ struct ChannelsPReLUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + InferenceEngine::Builder::PReLULayer ieLayer(""); + const size_t numChannels = scale.total(); + ieLayer.setWeights(wrapToInfEngineBlob(scale, {numChannels}, InferenceEngine::Layout::C)); + return ieLayer; + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "PReLU"; @@ -1151,6 +1224,7 @@ struct ChannelsPReLUFunctor ieLayer->_weights = wrapToInfEngineBlob(scale, {numChannels}, InferenceEngine::Layout::C); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index e0895b7f45..ed6da9e1a4 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -99,7 +99,7 @@ public: return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_INFERENCE_ENGINE && - (preferableTarget != DNN_TARGET_MYRIAD || coeffs.empty())); + (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())); } bool getMemoryShapes(const std::vector &inputs, @@ -420,9 +420,29 @@ public: return Ptr(); } - virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::EltwiseLayer ieLayer(name); + + ieLayer.setInputPorts(std::vector(inputs.size())); + + if (op == SUM) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM); + else if (op == PROD) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL); + else if (op == MAX) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX); + else + 
CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!coeffs.empty()) + l.getParameters()["coeff"] = coeffs; + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Eltwise"; @@ -438,6 +458,7 @@ public: else CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index e3382f2d53..3a704dca81 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -152,9 +152,19 @@ public: } } - virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + ieLayer.setType("Flatten"); + ieLayer.getParameters()["axis"] = _startAxis; + ieLayer.getParameters()["end_axis"] = _endAxis; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Flatten"; @@ -163,6 +173,7 @@ public: ieLayer->params["axis"] = format("%d", _startAxis); ieLayer->params["end_axis"] = format("%d", _endAxis); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 78d3e809b5..3a71a872fe 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -442,6 +442,18 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::FullyConnectedLayer ieLayer(name); + + const int outNum = blobs[0].size[0]; + ieLayer.setOutputNum(outNum); + + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW)); + if (blobs.size() > 1) + ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C)); + + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "FullyConnected"; @@ -456,6 +468,7 @@ public: if (blobs.size() > 1) ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)ieLayer->_out_num}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index b92610272b..fbd0c6ac59 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -393,6 +393,17 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::NormLayer ieLayer(name); + ieLayer.setSize(size); + ieLayer.setAlpha(alpha); + ieLayer.setBeta(beta); + ieLayer.setAcrossMaps(type == CHANNEL_NRM); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["k"] = bias; + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams 
lp; lp.name = name; lp.type = "Norm"; @@ -405,6 +416,7 @@ public: ieLayer->_alpha = alpha; ieLayer->_isAcrossMaps = (type == CHANNEL_NRM); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 93dd5f05f6..772902ca01 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -371,6 +371,13 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::MVNLayer ieLayer(name); + ieLayer.setAcrossChannels(acrossChannels); + ieLayer.setNormalize(normVariance); + ieLayer.setEpsilon(eps); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "MVN"; @@ -380,6 +387,7 @@ public: ieLayer->params["normalize_variance"] = normVariance ? "1" : "0"; ieLayer->params["eps"] = format("%f", eps); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index b3ca64f24a..4766f1704e 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -264,6 +264,49 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + if (input->dims.size() == 4) + { + InferenceEngine::Builder::NormalizeLayer ieLayer(name); + + ieLayer.setChannelShared(false); + ieLayer.setAcrossMaps(acrossSpatial); + ieLayer.setEpsilon(epsilon); + + InferenceEngine::Builder::Layer l = ieLayer; + const int numChannels = input->dims[2]; // NOTE: input->dims are reversed (whcn) + if (blobs.empty()) + { + auto weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + InferenceEngine::Layout::C, + {(size_t)numChannels}); + weights->allocate(); + std::vector ones(numChannels, 1); + weights->set(ones); + l.addConstantData("weights", weights); + l.getParameters()["channel_shared"] = false; + } + else + { + CV_Assert(numChannels == blobs[0].total()); + l.addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)numChannels}, InferenceEngine::Layout::C)); + l.getParameters()["channel_shared"] = blobs[0].total() == 1; + } + l.getParameters()["across_spatial"] = acrossSpatial; + return Ptr(new InfEngineBackendNode(l)); + } + else + { + InferenceEngine::Builder::GRNLayer ieLayer(name); + ieLayer.setBeta(epsilon); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["bias"] = epsilon; + + return Ptr(new InfEngineBackendNode(l)); + } +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; @@ -307,6 +350,7 @@ public: ieLayer->params["bias"] = format("%f", epsilon); return Ptr(new InfEngineBackendNode(ieLayer)); } +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index ace567e182..bade509169 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -385,6 +385,11 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + 
InferenceEngine::Builder::PermuteLayer ieLayer(name); + ieLayer.setOrder(_order); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Permute"; @@ -397,6 +402,7 @@ public: ieLayer->params["order"] += format(",%zu", _order[i]); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 11fa7eaeab..bfcc1068e1 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -295,6 +295,48 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if (type == MAX || type == AVE) + { + InferenceEngine::Builder::PoolingLayer ieLayer(name); + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setPaddingsBegin({pad_t, pad_l}); + ieLayer.setPaddingsEnd({pad_b, pad_r}); + ieLayer.setPoolingType(type == MAX ? + InferenceEngine::Builder::PoolingLayer::PoolingType::MAX : + InferenceEngine::Builder::PoolingLayer::PoolingType::AVG); + ieLayer.setRoundingType(ceilMode ? + InferenceEngine::Builder::PoolingLayer::RoundingType::CEIL : + InferenceEngine::Builder::PoolingLayer::RoundingType::FLOOR); + ieLayer.setExcludePad(type == AVE && padMode == "SAME"); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!padMode.empty()) + l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper"); + return Ptr(new InfEngineBackendNode(l)); + } + else if (type == ROI) + { + InferenceEngine::Builder::ROIPoolingLayer ieLayer(name); + ieLayer.setSpatialScale(spatialScale); + ieLayer.setPooled({pooledSize.height, pooledSize.width}); + ieLayer.setInputPorts(std::vector(2)); + return Ptr(new InfEngineBackendNode(ieLayer)); + } + else if (type == PSROI) + { + InferenceEngine::Builder::PSROIPoolingLayer ieLayer(name); + ieLayer.setSpatialScale(spatialScale); + ieLayer.setOutputDim(psRoiOutChannels); + ieLayer.setGroupSize(pooledSize.width); + ieLayer.setInputPorts(std::vector(2)); + return Ptr(new InfEngineBackendNode(ieLayer)); + } + else + CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); + return Ptr(); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.precision = InferenceEngine::Precision::FP32; @@ -353,6 +395,7 @@ public: CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 93b39827d6..fb690d76ef 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -498,6 +498,58 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if (_explicitSizes) + { + InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name); + + CV_Assert(_stepX == _stepY); + ieLayer.setStep(_stepX); + + CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); + ieLayer.setOffset(_offsetsX[0]); + + ieLayer.setClip(_clip); + ieLayer.setFlip(false); // We already flipped aspect ratios. 
+ + InferenceEngine::Builder::Layer l = ieLayer; + + CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty()); + CV_Assert(_boxWidths.size() == _boxHeights.size()); + l.getParameters()["width"] = _boxWidths; + l.getParameters()["height"] = _boxHeights; + l.getParameters()["variance"] = _variance; + return Ptr(new InfEngineBackendNode(l)); + } + else + { + InferenceEngine::Builder::PriorBoxLayer ieLayer(name); + + CV_Assert(!_explicitSizes); + + ieLayer.setMinSize(_minSize); + if (_maxSize > 0) + ieLayer.setMaxSize(_maxSize); + + CV_Assert(_stepX == _stepY); + ieLayer.setStep(_stepX); + + CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); + ieLayer.setOffset(_offsetsX[0]); + + ieLayer.setClip(_clip); + ieLayer.setFlip(false); // We already flipped aspect ratios. + + InferenceEngine::Builder::Layer l = ieLayer; + if (!_aspectRatios.empty()) + { + l.getParameters()["aspect_ratio"] = _aspectRatios; + } + CV_Assert(!_variance.empty()); + l.getParameters()["variance"] = _variance; + return Ptr(new InfEngineBackendNode(l)); + } +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = _explicitSizes ? "PriorBoxClustered" : "PriorBox"; @@ -553,6 +605,7 @@ public: ieLayer->params["offset"] = format("%f", _offsetsX[0]); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp index f559ee40e2..6514ed3a5c 100644 --- a/modules/dnn/src/layers/proposal_layer.cpp +++ b/modules/dnn/src/layers/proposal_layer.cpp @@ -328,6 +328,28 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ProposalLayer ieLayer(name); + + ieLayer.setBaseSize(baseSize); + ieLayer.setFeatStride(featStride); + ieLayer.setMinSize(16); + ieLayer.setNMSThresh(nmsThreshold); + ieLayer.setPostNMSTopN(keepTopAfterNMS); + ieLayer.setPreNMSTopN(keepTopBeforeNMS); + + std::vector scalesVec(scales.size()); + for (int i = 0; i < scales.size(); ++i) + scalesVec[i] = scales.get(i); + ieLayer.setScale(scalesVec); + + std::vector ratiosVec(ratios.size()); + for (int i = 0; i < ratios.size(); ++i) + ratiosVec[i] = ratios.get(i); + ieLayer.setRatio(ratiosVec); + + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Proposal"; @@ -353,6 +375,7 @@ public: ieLayer->params["scale"] += format(",%f", scales.get(i)); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index a98f690e65..3e42db5de1 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -181,6 +181,11 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ReorgYoloLayer ieLayer(name); + ieLayer.setStride(reorgStride); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ReorgYolo"; @@ -188,6 +193,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); ieLayer->params["stride"] = format("%d", reorgStride); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // 
HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 4109802a66..d6290456fa 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -203,6 +203,17 @@ public: return true; } + void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector outputs; + outputs_arr.getMatVector(outputs); + + CV_Assert(!outputs.empty()); + outShapes.resize(outputs.size()); + for (int i = 0; i < outputs.size(); ++i) + outShapes[i] = shape(outputs[i]); + } + bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { std::vector inputs; @@ -218,8 +229,7 @@ public: void *dst_handle = outputs[i].handle(ACCESS_WRITE); if (src_handle != dst_handle) { - MatShape outShape = shape(outputs[i]); - UMat umat = srcBlob.reshape(1, (int)outShape.size(), &outShape[0]); + UMat umat = srcBlob.reshape(1, (int)outShapes[i].size(), &outShapes[i][0]); umat.copyTo(outputs[i]); } } @@ -250,6 +260,12 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ReshapeLayer ieLayer(name); + CV_Assert(outShapes.size() == 1); + ieLayer.setDims(outShapes[0]); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Reshape"; @@ -265,9 +281,13 @@ public: ieLayer->shape = std::vector(shapeSrc->dims.rbegin(), shapeSrc->dims.rend()); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } + +private: + std::vector outShapes; }; Ptr ReshapeLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 6aa32150b6..03d806ad2c 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -163,6 +163,33 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + if (interpolation == "nearest") + { + ieLayer.setType("Resample"); + ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST"); + ieLayer.getParameters()["antialias"] = false; + if (scaleWidth != scaleHeight) + CV_Error(Error::StsNotImplemented, "resample with sw != sh"); + ieLayer.getParameters()["factor"] = 1.0 / scaleWidth; + } + else if (interpolation == "bilinear") + { + ieLayer.setType("Interp"); + ieLayer.getParameters()["pad_beg"] = 0; + ieLayer.getParameters()["pad_end"] = 0; + ieLayer.getParameters()["align_corners"] = false; + } + else + CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation); + ieLayer.getParameters()["width"] = outWidth; + ieLayer.getParameters()["height"] = outHeight; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.precision = InferenceEngine::Precision::FP32; @@ -187,6 +214,7 @@ public: ieLayer->params["width"] = cv::format("%d", outWidth); ieLayer->params["height"] = cv::format("%d", outHeight); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -247,6 +275,18 @@ public: virtual Ptr initInfEngine(const std::vector 
>&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + ieLayer.setType("Interp"); + ieLayer.getParameters()["pad_beg"] = 0; + ieLayer.getParameters()["pad_end"] = 0; + ieLayer.getParameters()["width"] = outWidth; + ieLayer.getParameters()["height"] = outHeight; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Interp"; @@ -256,6 +296,7 @@ public: ieLayer->params["pad_beg"] = "0"; ieLayer->params["pad_end"] = "0"; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index b217632584..a11fd379a2 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -197,6 +197,29 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + + CV_Assert(!blobs.empty()); + const size_t numChannels = blobs[0].total(); + if (hasWeights) + { + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C)); + } + else + { + auto weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + {numChannels}); + weights->allocate(); + + std::vector ones(numChannels, 1); + weights->set(ones); + ieLayer.setWeights(weights); + } + if (hasBias) + ieLayer.setBiases(wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ScaleShift"; @@ -223,6 +246,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 66f9aea440..0821979376 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -110,8 +110,15 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1 && sliceRanges[0].size() == 4); +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + { + return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) && + sliceRanges.size() == 1 && sliceRanges[0].size() == 4; + } + else +#endif + return backendId == DNN_BACKEND_OPENCV; } bool getMemoryShapes(const std::vector &inputs, @@ -254,9 +261,10 @@ public: } } +#ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { -#ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; lp.name = name; @@ -286,10 +294,11 @@ public: ieLayer->dim.push_back(sliceRanges[0][i].end - sliceRanges[0][i].start); } return Ptr(new InfEngineBackendNode(ieLayer)); - -#endif // HAVE_INF_ENGINE +#else return Ptr(); +#endif // IE < R5 } +#endif }; Ptr SliceLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/softmax_layer.cpp 
b/modules/dnn/src/layers/softmax_layer.cpp index ab4fd6d7ce..cdd91059ed 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -326,6 +326,13 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + + InferenceEngine::Builder::SoftMaxLayer ieLayer(name); + ieLayer.setAxis(clamp(axisRaw, input->dims.size())); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; @@ -335,6 +342,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::SoftMaxLayer(lp)); ieLayer->axis = clamp(axisRaw, input->dims.size()); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 7beec8a1a0..98de907b9e 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -18,6 +18,10 @@ namespace cv { namespace dnn { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) +InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::Builder::Layer& _layer) + : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {} +#else InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& _layer) : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {} @@ -40,6 +44,7 @@ void InfEngineBackendNode::connect(std::vector >& inputs, layer->outData[0] = dataPtr; dataPtr->creatorLayer = InferenceEngine::CNNLayerWeakPtr(layer); } +#endif static std::vector > infEngineWrappers(const std::vector >& ptrs) @@ -54,6 +59,129 @@ infEngineWrappers(const std::vector >& ptrs) return wrappers; } +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + +InfEngineBackendNet::InfEngineBackendNet() : netBuilder("") +{ + hasNetOwner = false; + targetDevice = InferenceEngine::TargetDevice::eCPU; +} + +InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net) +{ + hasNetOwner = true; + targetDevice = InferenceEngine::TargetDevice::eCPU; +} + +void InfEngineBackendNet::connect(const std::vector >& inputs, + const std::vector >& outputs, + const std::string& layerName) +{ + std::vector > inpWrappers = infEngineWrappers(inputs); + std::map::iterator it = layers.find(layerName); + CV_Assert(it != layers.end()); + + const int layerId = it->second; + for (int i = 0; i < inpWrappers.size(); ++i) + { + const auto& inp = inpWrappers[i]; + const std::string& inpName = inp->dataPtr->name; + int inpId; + it = layers.find(inpName); + if (it == layers.end()) + { + InferenceEngine::Builder::InputLayer inpLayer(inpName); + + std::vector shape(inp->blob->dims()); + std::reverse(shape.begin(), shape.end()); + + inpLayer.setPort(InferenceEngine::Port(shape)); + inpId = netBuilder.addLayer(inpLayer); + + layers.insert({inpName, inpId}); + } + else + inpId = it->second; + + netBuilder.connect(inpId, {layerId, i}); + unconnectedLayersIds.erase(inpId); + } + CV_Assert(!outputs.empty()); + InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]); + dataPtr->name = layerName; +} + +void InfEngineBackendNet::init(int targetId) +{ + if (!hasNetOwner) + { + CV_Assert(!unconnectedLayersIds.empty()); + for (int id : unconnectedLayersIds) + { + InferenceEngine::Builder::OutputLayer outLayer("myconv1"); + 
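The builder path above assembles the network incrementally: every layer added through addLayer() is remembered in a name-to-id map and provisionally treated as a network output, connect() resolves (or lazily creates) its inputs and drops them from the unconnected set, and init() finally attaches an output layer to every id that is still unconnected before converting the builder into a CNNNetwork. A minimal sketch of just that bookkeeping, using plain standard containers and an invented GraphBuilder type (not the Inference Engine API), assuming a single-output toy graph:

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the layers / unconnectedLayersIds bookkeeping above.
    struct GraphBuilder
    {
        std::map<std::string, int> layers;   // layer name -> id
        std::set<int> unconnected;           // ids that nothing consumes yet
        int nextId = 0;

        int addLayer(const std::string& name)
        {
            int id = nextId++;
            layers[name] = id;
            unconnected.insert(id);          // assume it is an output until something consumes it
            return id;
        }

        void connect(const std::string& src, const std::string& dst)
        {
            int srcId = layers.count(src) ? layers[src] : addLayer(src); // lazily create unknown inputs
            (void)dst;
            unconnected.erase(srcId);        // src now feeds another layer, so it is not an output
        }

        std::vector<int> finish() const      // like init(): remaining ids get output layers attached
        {
            return std::vector<int>(unconnected.begin(), unconnected.end());
        }
    };

    int main()
    {
        GraphBuilder g;
        g.addLayer("conv1");
        g.addLayer("relu1");
        g.connect("conv1", "relu1");
        for (int id : g.finish())
            std::cout << "still unconnected (network output) id: " << id << '\n'; // relu1 only
    }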
netBuilder.addLayer({id}, outLayer); + } + cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build())); + } + + switch (targetId) + { + case DNN_TARGET_CPU: + targetDevice = InferenceEngine::TargetDevice::eCPU; + break; + case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16: + targetDevice = InferenceEngine::TargetDevice::eGPU; + break; + case DNN_TARGET_MYRIAD: + targetDevice = InferenceEngine::TargetDevice::eMYRIAD; + break; + case DNN_TARGET_FPGA: + targetDevice = InferenceEngine::TargetDevice::eFPGA; + break; + default: + CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId)); + } + + for (const auto& name : requestedOutputs) + { + cnn.addOutput(name); + } + + for (const auto& it : cnn.getInputsInfo()) + { + const std::string& name = it.first; + auto blobIt = allBlobs.find(name); + CV_Assert(blobIt != allBlobs.end()); + inpBlobs[name] = blobIt->second; + it.second->setPrecision(blobIt->second->precision()); + } + for (const auto& it : cnn.getOutputsInfo()) + { + const std::string& name = it.first; + auto blobIt = allBlobs.find(name); + CV_Assert(blobIt != allBlobs.end()); + outBlobs[name] = blobIt->second; + it.second->setPrecision(blobIt->second->precision()); // Should be always FP32 + } + + initPlugin(cnn); +} + +void InfEngineBackendNet::addLayer(const InferenceEngine::Builder::Layer& layer) +{ + int id = netBuilder.addLayer(layer); + const std::string& layerName = layer.getName(); + CV_Assert(layers.insert({layerName, id}).second); + unconnectedLayersIds.insert(id); +} + +void InfEngineBackendNet::addOutput(const std::string& name) +{ + requestedOutputs.push_back(name); +} + +#endif // IE >= R5 + static InferenceEngine::Layout estimateLayout(const Mat& m) { if (m.dims == 4) @@ -148,6 +276,7 @@ void InfEngineBackendWrapper::setHostDirty() } +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InfEngineBackendNet::InfEngineBackendNet() { targetDevice = InferenceEngine::TargetDevice::eCPU; @@ -491,6 +620,8 @@ void InfEngineBackendNet::init(int targetId) initPlugin(*this); } +#endif // IE < R5 + static std::map sharedPlugins; void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) @@ -566,7 +697,11 @@ void InfEngineBackendNet::addBlobs(const std::vector >& ptrs auto wrappers = infEngineWrappers(ptrs); for (const auto& wrapper : wrappers) { - allBlobs.insert({wrapper->dataPtr->name, wrapper->blob}); + std::string name = wrapper->dataPtr->name; +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + name = name.empty() ? "id1" : name; // TODO: drop the magic input name. 
+#endif + allBlobs.insert({name, wrapper->blob}); } } diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 118e525d97..a224767f8d 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -35,6 +35,11 @@ #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000)) #define INF_ENGINE_VER_MAJOR_GE(ver) (((INF_ENGINE_RELEASE) / 10000) >= ((ver) / 10000)) +#define INF_ENGINE_VER_MAJOR_LT(ver) (((INF_ENGINE_RELEASE) / 10000) < ((ver) / 10000)) + +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) +#include +#endif #endif // HAVE_INF_ENGINE @@ -42,6 +47,7 @@ namespace cv { namespace dnn { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) class InfEngineBackendNet : public InferenceEngine::ICNNNetwork { public: @@ -146,17 +152,75 @@ private: void initPlugin(InferenceEngine::ICNNNetwork& net); }; +#else // IE < R5 + +class InfEngineBackendNet +{ +public: + InfEngineBackendNet(); + + InfEngineBackendNet(InferenceEngine::CNNNetwork& net); + + void addLayer(const InferenceEngine::Builder::Layer& layer); + + void addOutput(const std::string& name); + + void connect(const std::vector >& inputs, + const std::vector >& outputs, + const std::string& layerName); + + bool isInitialized(); + + void init(int targetId); + + void forward(); + + void initPlugin(InferenceEngine::ICNNNetwork& net); + + void addBlobs(const std::vector >& ptrs); + +private: + InferenceEngine::Builder::Network netBuilder; + + InferenceEngine::InferenceEnginePluginPtr enginePtr; + InferenceEngine::InferencePlugin plugin; + InferenceEngine::ExecutableNetwork netExec; + InferenceEngine::InferRequest infRequest; + InferenceEngine::BlobMap allBlobs; + InferenceEngine::BlobMap inpBlobs; + InferenceEngine::BlobMap outBlobs; + InferenceEngine::TargetDevice targetDevice; + + InferenceEngine::CNNNetwork cnn; + bool hasNetOwner; + + std::map layers; + std::vector requestedOutputs; + + std::set unconnectedLayersIds; +}; +#endif // IE < R5 + class InfEngineBackendNode : public BackendNode { public: +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InfEngineBackendNode(const InferenceEngine::Builder::Layer& layer); +#else InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& layer); +#endif void connect(std::vector >& inputs, std::vector >& outputs); - InferenceEngine::CNNLayerPtr layer; // Inference Engine network object that allows to obtain the outputs of this layer. 
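The INF_ENGINE_VER_MAJOR_* macros used throughout these hunks compare only the release part of INF_ENGINE_RELEASE by integer-dividing it by 10000, so 2018R5 (2018050000) becomes 201805 and 2018R4 (2018040000) becomes 201804; the lower build digits never affect the comparison. A tiny self-contained check of that arithmetic, with the macros reproduced here purely for illustration outside their real header:

    #include <cassert>

    // Same arithmetic as the macros in op_inf_engine.hpp, reproduced for illustration.
    #define RELEASE_2018R4 2018040000
    #define RELEASE_2018R5 2018050000
    #define VER_MAJOR_GE(cur, ver) (((cur) / 10000) >= ((ver) / 10000))
    #define VER_MAJOR_LT(cur, ver) (((cur) / 10000) <  ((ver) / 10000))

    int main()
    {
        static_assert(RELEASE_2018R5 / 10000 == 201805, "only the release digits survive");
        assert(VER_MAJOR_GE(RELEASE_2018R5, RELEASE_2018R5));  // R5 builds take the Builder path
        assert(VER_MAJOR_LT(RELEASE_2018R4, RELEASE_2018R5));  // R4 builds keep the CNNLayer path
        assert(!VER_MAJOR_LT(2018050123, RELEASE_2018R5));     // a hypothetical R5 patch build still counts as R5
        return 0;
    }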
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer layer; Ptr net; +#else + InferenceEngine::CNNLayerPtr layer; + Ptr net; +#endif }; class InfEngineBackendWrapper : public BackendWrapper diff --git a/modules/dnn/src/torch/THGeneral.cpp b/modules/dnn/src/torch/THGeneral.cpp index 8a52745770..0c27edc6fb 100644 --- a/modules/dnn/src/torch/THGeneral.cpp +++ b/modules/dnn/src/torch/THGeneral.cpp @@ -1,10 +1,2 @@ #include "../precomp.hpp" - -#if defined(TH_DISABLE_HEAP_TRACKING) -#elif (defined(__unix) || defined(_WIN32)) -#include -#elif defined(__APPLE__) -#include -#endif - #include "THGeneral.h" diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 3a64d6485b..eef4f6ba79 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -180,7 +180,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); - float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 0.0; + float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 2e-5; float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0; processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt", inp, "detection_out", "", l1, lInf, 0.25); @@ -288,7 +288,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false); // Output image has values in range [-143.526, 148.539]. float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.3 : 4e-5; - float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.0 : 2e-3; + float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
7.28 : 2e-3; processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf); } diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 5d41b4b916..d7c14f2714 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -306,7 +306,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc) // batch size 1 testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_MYRIAD) #endif // batch size 2 diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index 468953fe7e..9cbfb0c402 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -163,7 +163,7 @@ TEST_P(Deconvolution, Accuracy) bool hasBias = get<6>(GetParam()); Backend backendId = get<0>(get<7>(GetParam())); Target targetId = get<1>(get<7>(GetParam())); - if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU && + if (backendId == DNN_BACKEND_INFERENCE_ENGINE && (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_MYRIAD) && dilation.width == 2 && dilation.height == 2) throw SkipTestException(""); #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 @@ -466,6 +466,7 @@ void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId) pool.set("stride_w", 2); pool.set("stride_h", 2); pool.type = "Pooling"; + pool.name = "ave_pool"; Net net; int poolId = net.addLayer(pool.name, pool.type, pool); diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 4ccefd28a9..62e625f03c 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -295,10 +295,6 @@ TEST_P(Test_Caffe_layers, Eltwise) { if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) - throw SkipTestException("Test is disabled for OpenVINO 2018R5"); -#endif testLayerUsingCaffeModels("layer_eltwise"); } diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index deccbfb0eb..acdd66631c 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -351,6 +351,10 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR) l1 = 0.009; lInf = 0.035; } + else if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU) { + l1 = 4.5e-5; + lInf = 1.9e-4; + } testONNXModels("LResNet100E_IR", pb, l1, lInf); } @@ -366,6 +370,10 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus) l1 = 0.021; lInf = 0.034; } + else if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_CPU || target == DNN_TARGET_OPENCL)) { + l1 = 2.4e-4; + lInf = 6e-4; + } testONNXModels("emotion_ferplus", pb, l1, lInf); } @@ -389,7 +397,7 @@ TEST_P(Test_ONNX_nets, Inception_v1) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); + throw SkipTestException("Test is disabled for OpenVINO 2018R5"); #endif testONNXModels("inception_v1", pb); } diff --git a/modules/dnn/test/test_tf_importer.cpp 
b/modules/dnn/test/test_tf_importer.cpp index ce4997cd4e..b20b166551 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -40,7 +40,7 @@ TEST(Test_TensorFlow, read_inception) ASSERT_TRUE(!sample.empty()); Mat input; resize(sample, input, Size(224, 224)); - input -= 128; // mean sub + input -= Scalar::all(117); // mean sub Mat inputBlob = blobFromImage(input); @@ -351,8 +351,8 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD) Mat out = net.forward(); Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); - float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1e-5; - float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0098 : 1e-3; + float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1.5e-5; + float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 1e-3; normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff); } @@ -366,6 +366,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); + double scoresDiff = backend == DNN_BACKEND_INFERENCE_ENGINE ? 2.9e-5 : 1e-5; for (int i = 0; i < 2; ++i) { std::string proto = findDataFile("dnn/" + names[i] + ".pbtxt", false); @@ -381,7 +382,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) Mat out = net.forward(); Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/" + names[i] + ".detection_out.npy")); - normAssertDetections(ref, out, names[i].c_str(), 0.3); + normAssertDetections(ref, out, names[i].c_str(), 0.3, scoresDiff); } } @@ -406,7 +407,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN) net.setInput(blob); Mat out = net.forward(); - double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : default_l1; + double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 1.1e-5; double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.021 : default_lInf; normAssertDetections(ref, out, "", 0.4, scoreDiff, iouDiff); } @@ -568,10 +569,6 @@ TEST_P(Test_TensorFlow_layers, slice) if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); -#endif runTensorFlowNet("slice_4d"); } diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index c63cf26e45..046bd65b86 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -260,6 +260,11 @@ TEST_P(Test_Torch_layers, run_paralel) TEST_P(Test_Torch_layers, net_residual) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || + target == DNN_TARGET_OPENCL_FP16)) + throw SkipTestException("Test is disabled for OpenVINO 2018R5"); +#endif runTorchNet("net_residual", "", false, true); } @@ -390,10 +395,6 @@ TEST_P(Test_Torch_nets, ENet_accuracy) // -model models/instance_norm/feathers.t7 TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); -#endif checkBackend(); std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7", "dnn/fast_neural_style_instance_norm_feathers.t7"}; diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp index 9076c23545..f1e8b63799 100644 --- a/modules/features2d/src/blobdetector.cpp +++ b/modules/features2d/src/blobdetector.cpp @@ -197,8 +197,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag centers.clear(); std::vector < std::vector > contours; - Mat tmpBinaryImage = binaryImage.clone(); - findContours(tmpBinaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE); + findContours(binaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE); #ifdef DEBUG_BLOB_DETECTOR // Mat keypointsImage; @@ -214,7 +213,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag { Center center; center.confidence = 1; - Moments moms = moments(Mat(contours[contourIdx])); + Moments moms = moments(contours[contourIdx]); if (params.filterByArea) { double area = moms.m00; @@ -225,7 +224,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag if (params.filterByCircularity) { double area = moms.m00; - double perimeter = arcLength(Mat(contours[contourIdx]), true); + double perimeter = arcLength(contours[contourIdx], true); double ratio = 4 * CV_PI * area / (perimeter * perimeter); if (ratio < params.minCircularity || ratio >= params.maxCircularity) continue; @@ -261,9 +260,9 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag if (params.filterByConvexity) { std::vector < Point > hull; - convexHull(Mat(contours[contourIdx]), hull); - double area = contourArea(Mat(contours[contourIdx])); - double hullArea = contourArea(Mat(hull)); + convexHull(contours[contourIdx], hull); + double area = contourArea(contours[contourIdx]); + double hullArea = contourArea(hull); if (fabs(hullArea) < DBL_EPSILON) continue; double ratio = area / hullArea; diff --git a/modules/imgproc/perf/perf_contours.cpp 
b/modules/imgproc/perf/perf_contours.cpp index d3a70cfdd7..bc8b530016 100644 --- a/modules/imgproc/perf/perf_contours.cpp +++ b/modules/imgproc/perf/perf_contours.cpp @@ -84,4 +84,26 @@ PERF_TEST_P(TestFindContoursFF, findContours, SANITY_CHECK_NOTHING(); } +typedef TestBaseWithParam< tuple > TestBoundingRect; + +PERF_TEST_P(TestBoundingRect, BoundingRect, + Combine( + testing::Values(CV_32S, CV_32F), // points type + Values(400, 511, 1000, 10000, 100000) // points count + ) +) + +{ + int ptType = get<0>(GetParam()); + int n = get<1>(GetParam()); + + Mat pts(n, 2, ptType); + declare.in(pts, WARMUP_RNG); + + cv::Rect rect; + TEST_CYCLE() rect = boundingRect(pts); + + SANITY_CHECK_NOTHING(); +} + } } // namespace diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp index 4b2ba97148..d64c49e0a9 100644 --- a/modules/imgproc/perf/perf_integral.cpp +++ b/modules/imgproc/perf/perf_integral.cpp @@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam Size_MatType_OutMatD PERF_TEST_P(Size_MatType_OutMatDepth, integral, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4), testing::Values(CV_32S, CV_32F, CV_64F) ) ) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 550fdffdb9..538f158b6e 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -213,7 +213,7 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs } // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16); + bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); dx1 = std::max(anchor.x - roi.x, 0); dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp index 0878dc456f..a1a75a29e1 100644 --- a/modules/imgproc/src/fixedpoint.inl.hpp +++ b/modules/imgproc/src/fixedpoint.inl.hpp @@ -11,16 +11,6 @@ #include "opencv2/core/softfloat.hpp" -#ifndef CV_ALWAYS_INLINE - #if defined(__GNUC__) && (__GNUC__ > 3 ||(__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) - #define CV_ALWAYS_INLINE inline __attribute__((always_inline)) - #elif defined(_MSC_VER) - #define CV_ALWAYS_INLINE __forceinline - #else - #define CV_ALWAYS_INLINE inline - #endif -#endif - namespace { diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp index 52dc239bc6..5690553b70 100644 --- a/modules/imgproc/src/morph.cpp +++ b/modules/imgproc/src/morph.cpp @@ -45,6 +45,7 @@ #include "opencl_kernels_imgproc.hpp" #include #include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" #include /****************************************************************************************\ @@ -97,73 +98,65 @@ struct MorphNoVec int operator()(uchar**, int, uchar*, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD -template struct MorphRowIVec +template struct MorphRowVec { - enum { ESZ = VecUpdate::ESZ }; - - MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar* src, uchar* dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - cn *= ESZ; int i, k, 
_ksize = ksize*cn; - width = (width & -4)*cn; + width *= cn; VecUpdate updateOp; - for( i = 0; i <= width - 16; i += 16 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) { - __m128i s = _mm_loadu_si128((const __m128i*)(src + i)); - for( k = cn; k < _ksize; k += cn ) + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); + vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); + vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); + for (k = cn; k < _ksize; k += cn) { - __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k)); - s = updateOp(s, x); + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); + s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); } - _mm_storeu_si128((__m128i*)(dst + i), s); + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + v_store((stype*)dst + i + 2*vtype::nlanes, s2); + v_store((stype*)dst + i + 3*vtype::nlanes, s3); } - - for( ; i < width; i += 4 ) + if( i <= width - 2*vtype::nlanes ) { - __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i)); + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); for( k = cn; k < _ksize; k += cn ) { - __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k)); - s = updateOp(s, x); + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); } - *(int*)(dst + i) = _mm_cvtsi128_si32(s); + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; } - - return i/ESZ; - } - - int ksize, anchor; -}; - - -template struct MorphRowFVec -{ - MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar* src, uchar* dst, int width, int cn) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int i, k, _ksize = ksize*cn; - width = (width & -4)*cn; - VecUpdate updateOp; - - for( i = 0; i < width; i += 4 ) + if( i <= width - vtype::nlanes ) { - __m128 s = _mm_loadu_ps((const float*)src + i); + vtype s = vx_load((const stype*)src + i); for( k = cn; k < _ksize; k += cn ) - { - __m128 x = _mm_loadu_ps((const float*)src + i + k); - s = updateOp(s, x); - } - _mm_storeu_ps((float*)dst + i, s); + s = updateOp(s, vx_load((const stype*)src + i + k)); + v_store((stype*)dst + i, s); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s = vx_load_low((const stype*)src + i); + for( k = cn; k < _ksize; k += cn ) + s = updateOp(s, vx_load_low((const stype*)src + i + k)); + v_store_low((stype*)dst + i, s); + i += vtype::nlanes/2; } return i; @@ -173,230 +166,156 @@ template struct MorphRowFVec }; -template struct MorphColumnIVec +template struct MorphColumnVec { - enum { ESZ = VecUpdate::ESZ }; - - MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int i = 0, k, _ksize = ksize; - width *= ESZ; VecUpdate updateOp; 
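The vectorized row and column filters above all compute the same thing: a running minimum (erosion) or maximum (dilation) over ksize samples taken every cn elements, so each channel of interleaved data is filtered independently. A scalar reference of the row pass, written as a standalone sketch of the operation the universal-intrinsics code accelerates (it does not reproduce OpenCV's border handling):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Scalar reference of the morphology row filter: for every output position i,
    // take the min (erode) over src[i], src[i+cn], ..., src[i+(ksize-1)*cn].
    static void erodeRowScalar(const std::vector<unsigned char>& src,
                               std::vector<unsigned char>& dst,
                               int width, int cn, int ksize)
    {
        dst.assign(width * cn, 0);
        for (int i = 0; i < width * cn; ++i)
        {
            unsigned char m = src[i];
            for (int k = cn; k < ksize * cn; k += cn)   // same-channel neighbours only
                m = std::min(m, src[i + k]);
            dst[i] = m;
        }
    }

    int main()
    {
        // Single-channel toy row, padded so that i + (ksize-1)*cn stays in range.
        std::vector<unsigned char> src = {9, 3, 7, 5, 8, 2, 6, 4, 4, 4};
        std::vector<unsigned char> dst;
        erodeRowScalar(src, dst, /*width=*/8, /*cn=*/1, /*ksize=*/3);
        for (int i = 0; i < 8; ++i)
            printf("%d ", dst[i]);   // prints: 3 3 5 2 2 2 4 4
        printf("\n");
    }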
for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)src[i] & 15) == 0 ); + CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); + + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; + dststep /= sizeof(dst[0]); for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) { - for( i = 0; i <= width - 32; i += 32 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) { - const uchar* sptr = src[1] + i; - __m128i s0 = _mm_load_si128((const __m128i*)sptr); - __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); for( k = 2; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); } sptr = src[0] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0)); - _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1)); + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0)); - _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1)); + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); } - - for( ; i <= width - 8; i += 8 ) + if( i <= width - 2*vtype::nlanes ) { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0; + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); for( k = 2; k < _ksize; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); - } - - x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)); - _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0)); - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0)); - } - } - - for( ; count > 0; count--, dst += dststep, src++ ) - { - for( i = 0; i <= width - 32; i += 32 ) - { - const uchar* sptr = src[0] + i; - __m128i s0 = _mm_load_si128((const __m128i*)sptr); - __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; - - for( k = 1; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); 
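A pattern worth noting in the rewritten kernels is the tail handling: instead of one wide SIMD loop plus a scalar remainder, the code processes blocks of 4*nlanes, then optionally one block of 2*nlanes, one of nlanes, and one half register, which keeps the remainder vectorized for any CV_SIMD width. A stripped-down sketch of that structure, assuming the OpenCV universal intrinsics (vx_load, v_min, v_store, v_float32::nlanes) behave as they are used elsewhere in this patch, with a plain scalar remainder instead of the half-register step:

    #include <algorithm>
    #include <cstdio>
    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>   // universal intrinsics, as included by morph.cpp in this patch

    // Elementwise minimum of two float arrays, structured like the kernels above:
    // wide 4*nlanes blocks first, then progressively narrower vector tails, then scalar.
    static void minArrays(const float* a, const float* b, float* dst, int n)
    {
        int i = 0;
    #if CV_SIMD
        const int L = cv::v_float32::nlanes;
        for (; i <= n - 4*L; i += 4*L)
            for (int j = 0; j < 4*L; j += L)
                cv::v_store(dst + i + j, cv::v_min(cv::vx_load(a + i + j), cv::vx_load(b + i + j)));
        if (i <= n - 2*L)
        {
            cv::v_store(dst + i,     cv::v_min(cv::vx_load(a + i),     cv::vx_load(b + i)));
            cv::v_store(dst + i + L, cv::v_min(cv::vx_load(a + i + L), cv::vx_load(b + i + L)));
            i += 2*L;
        }
        if (i <= n - L)
        {
            cv::v_store(dst + i, cv::v_min(cv::vx_load(a + i), cv::vx_load(b + i)));
            i += L;
        }
    #endif
        for (; i < n; ++i)                    // scalar remainder (the real kernels also use half registers)
            dst[i] = std::min(a[i], b[i]);
    }

    int main()
    {
        std::vector<float> a(37, 2.f), b(37, 1.f), dst(37);
        minArrays(a.data(), b.data(), dst.data(), (int)a.size());
        printf("%g %g\n", dst[0], dst[36]);   // prints: 1 1
    }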
+ s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); } - _mm_storeu_si128((__m128i*)(dst + i), s0); - _mm_storeu_si128((__m128i*)(dst + i + 16), s1); - } - for( ; i <= width - 8; i += 8 ) - { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0; + sptr = src[0] + i; + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - for( k = 1; k < _ksize; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); - } - _mm_storel_epi64((__m128i*)(dst + i), s0); + sptr = src[k] + i; + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + i += 2*vtype::nlanes; } - } - - return i/ESZ; - } - - int ksize, anchor; -}; - - -template struct MorphColumnFVec -{ - MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int i = 0, k, _ksize = ksize; - VecUpdate updateOp; - - for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)_src[i] & 15) == 0 ); - - const float** src = (const float**)_src; - float* dst = (float*)_dst; - dststep /= sizeof(dst[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) - { - for( i = 0; i <= width - 16; i += 16 ) + if( i <= width - vtype::nlanes ) { - const float* sptr = src[1] + i; - __m128 s0 = _mm_load_ps(sptr); - __m128 s1 = _mm_load_ps(sptr + 4); - __m128 s2 = _mm_load_ps(sptr + 8); - __m128 s3 = _mm_load_ps(sptr + 12); - __m128 x0, x1, x2, x3; + vtype s0 = vx_load_aligned(src[1] + i); for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); - } - - sptr = src[0] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - _mm_storeu_ps(dst + i, updateOp(s0, x0)); - _mm_storeu_ps(dst + i + 4, updateOp(s1, x1)); - _mm_storeu_ps(dst + i + 8, updateOp(s2, x2)); - _mm_storeu_ps(dst + i + 12, updateOp(s3, x3)); + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0)); - _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1)); - _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2)); - _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3)); + v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); + i += vtype::nlanes; } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - vtype::nlanes/2 ) { - __m128 s0 = _mm_load_ps(src[1] + i), x0; + vtype s0 = vx_load_low(src[1] + i); for( k = 2; k < _ksize; k++ ) - { - x0 = _mm_load_ps(src[k] + i); - s0 = updateOp(s0, x0); - } + s0 = updateOp(s0, vx_load_low(src[k] + i)); - x0 = _mm_load_ps(src[0] + i); - _mm_storeu_ps(dst + i, updateOp(s0, x0)); - x0 = _mm_load_ps(src[k] + i); - _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0)); + v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); + v_store_low(dst + 
dststep + i, updateOp(s0, vx_load_low(src[k] + i))); + i += vtype::nlanes/2; } } for( ; count > 0; count--, dst += dststep, src++ ) { - for( i = 0; i <= width - 16; i += 16 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) { - const float* sptr = src[0] + i; - __m128 s0 = _mm_load_ps(sptr); - __m128 s1 = _mm_load_ps(sptr + 4); - __m128 s2 = _mm_load_ps(sptr + 8); - __m128 s3 = _mm_load_ps(sptr + 12); - __m128 x0, x1, x2, x3; + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); for( k = 1; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); } - - for( i = 0; i <= width - 4; i += 4 ) + if( i <= width - 2*vtype::nlanes ) { - __m128 s0 = _mm_load_ps(src[0] + i), x0; + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + for( k = 1; k < _ksize; k++ ) { - x0 = _mm_load_ps(src[k] + i); - s0 = updateOp(s0, x0); + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s0 = vx_load_aligned(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s0 = vx_load_low(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; } } @@ -407,185 +326,109 @@ template struct MorphColumnFVec }; -template struct MorphIVec +template struct MorphVec { - enum { ESZ = VecUpdate::ESZ }; - - int operator()(uchar** src, int nz, uchar* dst, int width) const + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + int operator()(uchar** _src, int nz, uchar* _dst, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; int i, k; - width *= ESZ; VecUpdate updateOp; - for( i = 0; i <= width - 32; i += 32 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) { - const uchar* sptr = src[0] + i; - __m128i s0 = _mm_loadu_si128((const __m128i*)sptr); - __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; - + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); + vtype s2 = vx_load(sptr + 2*vtype::nlanes); + vtype s3 = vx_load(sptr + 3*vtype::nlanes); for( k = 1; k < nz; k++ ) { sptr = src[k] + i; - x0 = 
_mm_loadu_si128((const __m128i*)sptr); - x1 = _mm_loadu_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - } - _mm_storeu_si128((__m128i*)(dst + i), s0); - _mm_storeu_si128((__m128i*)(dst + i + 16), s1); - } - - for( ; i <= width - 8; i += 8 ) - { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0; - - for( k = 1; k < nz; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); } - _mm_storel_epi64((__m128i*)(dst + i), s0); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); } - - return i/ESZ; - } -}; - - -template struct MorphFVec -{ - int operator()(uchar** _src, int nz, uchar* _dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - float* dst = (float*)_dst; - int i, k; - VecUpdate updateOp; - - for( i = 0; i <= width - 16; i += 16 ) + if( i <= width - 2*vtype::nlanes ) { - const float* sptr = src[0] + i; - __m128 s0 = _mm_loadu_ps(sptr); - __m128 s1 = _mm_loadu_ps(sptr + 4); - __m128 s2 = _mm_loadu_ps(sptr + 8); - __m128 s3 = _mm_loadu_ps(sptr + 12); - __m128 x0, x1, x2, x3; - + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); for( k = 1; k < nz; k++ ) { sptr = src[k] + i; - x0 = _mm_loadu_ps(sptr); - x1 = _mm_loadu_ps(sptr + 4); - x2 = _mm_loadu_ps(sptr + 8); - x3 = _mm_loadu_ps(sptr + 12); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - vtype::nlanes ) { - __m128 s0 = _mm_loadu_ps(src[0] + i), x0; - + vtype s0 = vx_load(src[0] + i); for( k = 1; k < nz; k++ ) - { - x0 = _mm_loadu_ps(src[k] + i); - s0 = updateOp(s0, x0); - } - _mm_storeu_ps(dst + i, s0); + s0 = updateOp(s0, vx_load(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; } - - for( ; i < width; i++ ) + if( i <= width - vtype::nlanes/2 ) { - __m128 s0 = _mm_load_ss(src[0] + i), x0; - + vtype s0 = vx_load_low(src[0] + i); for( k = 1; k < nz; k++ ) - { - x0 = _mm_load_ss(src[k] + i); - s0 = updateOp(s0, x0); - } - _mm_store_ss(dst + i, s0); + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; } - return i; } }; -struct VMin8u -{ - enum { ESZ = 1 }; - __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); } -}; -struct VMax8u -{ - enum { ESZ = 1 }; - __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); } -}; -struct VMin16u -{ - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); } -}; -struct VMax16u -{ - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_adds_epu16(_mm_subs_epu16(a,b), b); } -}; -struct VMin16s +template struct VMin { - enum { ESZ = 2 }; - __m128i 
operator()(const __m128i& a, const __m128i& b) const - { return _mm_min_epi16(a, b); } + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } }; -struct VMax16s +template struct VMax { - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_max_epi16(a, b); } + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } }; -struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }}; -struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }}; - -typedef MorphRowIVec ErodeRowVec8u; -typedef MorphRowIVec DilateRowVec8u; -typedef MorphRowIVec ErodeRowVec16u; -typedef MorphRowIVec DilateRowVec16u; -typedef MorphRowIVec ErodeRowVec16s; -typedef MorphRowIVec DilateRowVec16s; -typedef MorphRowFVec ErodeRowVec32f; -typedef MorphRowFVec DilateRowVec32f; - -typedef MorphColumnIVec ErodeColumnVec8u; -typedef MorphColumnIVec DilateColumnVec8u; -typedef MorphColumnIVec ErodeColumnVec16u; -typedef MorphColumnIVec DilateColumnVec16u; -typedef MorphColumnIVec ErodeColumnVec16s; -typedef MorphColumnIVec DilateColumnVec16s; -typedef MorphColumnFVec ErodeColumnVec32f; -typedef MorphColumnFVec DilateColumnVec32f; - -typedef MorphIVec ErodeVec8u; -typedef MorphIVec DilateVec8u; -typedef MorphIVec ErodeVec16u; -typedef MorphIVec DilateVec16u; -typedef MorphIVec ErodeVec16s; -typedef MorphIVec DilateVec16s; -typedef MorphFVec ErodeVec32f; -typedef MorphFVec DilateVec32f; + +typedef MorphRowVec > ErodeRowVec8u; +typedef MorphRowVec > DilateRowVec8u; +typedef MorphRowVec > ErodeRowVec16u; +typedef MorphRowVec > DilateRowVec16u; +typedef MorphRowVec > ErodeRowVec16s; +typedef MorphRowVec > DilateRowVec16s; +typedef MorphRowVec > ErodeRowVec32f; +typedef MorphRowVec > DilateRowVec32f; + +typedef MorphColumnVec > ErodeColumnVec8u; +typedef MorphColumnVec > DilateColumnVec8u; +typedef MorphColumnVec > ErodeColumnVec16u; +typedef MorphColumnVec > DilateColumnVec16u; +typedef MorphColumnVec > ErodeColumnVec16s; +typedef MorphColumnVec > DilateColumnVec16s; +typedef MorphColumnVec > ErodeColumnVec32f; +typedef MorphColumnVec > DilateColumnVec32f; + +typedef MorphVec > ErodeVec8u; +typedef MorphVec > DilateVec8u; +typedef MorphVec > ErodeVec16u; +typedef MorphVec > DilateVec16u; +typedef MorphVec > ErodeVec16s; +typedef MorphVec > DilateVec16s; +typedef MorphVec > ErodeVec32f; +typedef MorphVec > DilateVec32f; #else diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index d505fde4fc..436c74eade 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -39,6 +39,8 @@ // //M*/ #include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" + namespace cv { @@ -746,109 +748,161 @@ static Rect pointSetBoundingRect( const Mat& points ) if( npoints == 0 ) return Rect(); - const Point* pts = points.ptr(); - Point pt = pts[0]; +#if CV_SIMD + const int64_t* pts = points.ptr(); -#if CV_SSE4_2 - if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) + if( !is_float ) { - if( !is_float ) + v_int32 minval, maxval; + minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 ) { - __m128i minval, maxval; - minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y - - for( i = 1; i < npoints; i++ ) - { - __m128i ptXY = 
_mm_loadl_epi64((const __m128i*)&pts[i]); - minval = _mm_min_epi32(ptXY, minval); - maxval = _mm_max_epi32(ptXY, maxval); - } - xmin = _mm_cvtsi128_si32(minval); - ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); - xmax = _mm_cvtsi128_si32(maxval); - ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); + v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); } - else + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_int32::nlanes/4 ) { - __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); - minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); - - for( i = 1; i < npoints; i++ ) + v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_int64::nlanes/2; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = minval.get0(); + xmax = maxval.get0(); + ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0(); + ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0(); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_int32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); - - minvalf = _mm_min_ps(minvalf, ptXY); - maxvalf = _mm_max_ps(maxvalf, ptXY); + v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); } - - float xyminf[2], xymaxf[2]; - _mm_storel_pi((__m64*)xyminf, minvalf); - _mm_storel_pi((__m64*)xymaxf, maxvalf); - xmin = cvFloor(xyminf[0]); - ymin = cvFloor(xyminf[1]); - xmax = cvFloor(xymaxf[0]); - ymax = cvFloor(xymaxf[1]); + xmin = min(xmin, minval2.get0()); + xmax = max(xmax, maxval2.get0()); + ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()); + ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()); } +#endif } else -#endif { - if( !is_float ) + v_float32 minval, maxval; + minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 ) { - xmin = xmax = pt.x; - ymin = ymax = pt.y; - - for( i = 1; i < npoints; i++ ) + v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); + } + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_float32::nlanes/4 ) + { + 
v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_float32::nlanes/4; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = cvFloor(minval.get0()); + xmax = cvFloor(maxval.get0()); + ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0()); + ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0()); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_float32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - pt = pts[i]; + v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); + } + xmin = min(xmin, cvFloor(minval2.get0())); + xmax = max(xmax, cvFloor(maxval2.get0())); + ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0())); + ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0())); + } +#endif + } +#else + const Point* pts = points.ptr(); + Point pt = pts[0]; - if( xmin > pt.x ) - xmin = pt.x; + if( !is_float ) + { + xmin = xmax = pt.x; + ymin = ymax = pt.y; - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } - } - else - { - Cv32suf v; - // init values - xmin = xmax = CV_TOGGLE_FLT(pt.x); - ymin = ymax = CV_TOGGLE_FLT(pt.y); + if( xmax < pt.x ) + xmax = pt.x; - for( i = 1; i < npoints; i++ ) - { - pt = pts[i]; - pt.x = CV_TOGGLE_FLT(pt.x); - pt.y = CV_TOGGLE_FLT(pt.y); + if( ymin > pt.y ) + ymin = pt.y; - if( xmin > pt.x ) - xmin = pt.x; + if( ymax < pt.y ) + ymax = pt.y; + } + } + else + { + Cv32suf v; + // init values + xmin = xmax = CV_TOGGLE_FLT(pt.x); + ymin = ymax = CV_TOGGLE_FLT(pt.y); - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; + pt.x = CV_TOGGLE_FLT(pt.x); + pt.y = CV_TOGGLE_FLT(pt.y); - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } + if( xmax < pt.x ) + xmax = pt.x; + + if( ymin > pt.y ) + ymin = pt.y; - v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); - // because right and bottom sides of the bounding rectangle are not inclusive - // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil - v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); + if( ymax < pt.y ) + ymax = pt.y; } + + v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); + // because right and bottom sides of the bounding rectangle are not inclusive + // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil + v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } +#endif return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); } diff 
--git a/modules/imgproc/src/sumpixels.avx512_skx.cpp b/modules/imgproc/src/sumpixels.avx512_skx.cpp new file mode 100644 index 0000000000..7e5cbdcf88 --- /dev/null +++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp @@ -0,0 +1,262 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2019, Intel Corporation, all rights reserved. +#include "precomp.hpp" +#include "sumpixels.hpp" + +namespace cv { +namespace { // Anonymous namespace to avoid exposing the implementation classes + +// +// NOTE: Look at the bottom of the file for the entry-point function for external callers +// + +// At the moment only 3 channel support untilted is supported +// More channel support coming soon. +// TODO: Add support for sqsum and 1,2, and 4 channels +class IntegralCalculator_3Channel { +public: + IntegralCalculator_3Channel() {}; + + + void calculate_integral_avx512(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn) + { + const int srcstep = (int)(_srcstep/sizeof(uchar)); + const int sumstep = (int)(_sumstep/sizeof(double)); + const int sqsumstep = (int)(_sqsumstep/sizeof(double)); + const int ops_per_line = width * cn; + + // Clear the first line of the sum as per spec (see integral documentation) + // Also adjust the index of sum and sqsum to be at the real 0th element + // and not point to the border pixel so it stays in sync with the src pointer + memset( sum, 0, (ops_per_line+cn)*sizeof(double)); + sum += cn; + + if (sqsum) { + memset( sqsum, 0, (ops_per_line+cn)*sizeof(double)); + sqsum += cn; + } + + // Now calculate the integral over the whole image one line at a time + for(int y = 0; y < height; y++) { + const uchar * src_line = &src[y*srcstep]; + double * sum_above = &sum[y*sumstep]; + double * sum_line = &sum_above[sumstep]; + double * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep] : NULL; + double * sqsum_line = (sqsum) ? &sqsum_above[sqsumstep] : NULL; + + integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line); + + } + } + + static inline + void integral_line_3channel_avx512(const uchar *srcs, + double *sums, double *sums_above, + double *sqsums, double *sqsums_above, + int num_ops_in_line) + { + __m512i sum_accumulator = _mm512_setzero_si512(); // holds rolling sums for the line + __m512i sqsum_accumulator = _mm512_setzero_si512(); // holds rolling sqsums for the line + + // The first element on each line must be zeroes as per spec (see integral documentation) + set_border_pixel_value(sums, sqsums); + + // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk + aligned_integral( srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); + post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); + + } + + + static inline + void set_border_pixel_value(double *sums, double *sqsums) + { + // Sets the border pixel value to 0s. 
+ // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now + __m512i zeroes = _mm512_setzero_si512(); + + _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes); + if (sqsums) + _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes); + } + + + static inline + void aligned_integral(const uchar *&srcs, + double *&sums, double *&sums_above, + double *&sqsum, double *&sqsum_above, + __m512i &sum_accumulator, __m512i &sqsum_accumulator, + int num_ops_in_line) + { + // This function handles full 64 byte chunks of the source data at a time until it gets to the part of + // the line that no longer contains a full 64 byte chunk. Other code will handle the last part. + + const int num_chunks = num_ops_in_line >> 6; // quick int divide by 64 + + for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){ + integral_64_operations_avx512((__m512i *) srcs, + (__m512i *) sums, (__m512i *) sums_above, + (__m512i *) sqsum, (__m512i *) sqsum_above, + 0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator); + srcs+=64; sums+=64; sums_above+=64; + if (sqsum){ sqsum+= 64; sqsum_above+=64; } + } + } + + + static inline + void post_aligned_integral(const uchar *srcs, + const double *sums, const double *sums_above, + const double *sqsum, const double *sqsum_above, + __m512i &sum_accumulator, __m512i &sqsum_accumulator, + int num_ops_in_line) + { + // This function handles the last few straggling operations that are not a full chunk of 64 operations + // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64). + + const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F; // Quick int modulo 64 + + if (num_operations > 0) { + __mmask64 operation_mask = (1ULL << num_operations) - 1ULL; + + integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above, + (__m512i *) sqsum, (__m512i *) sqsum_above, + operation_mask, sum_accumulator, sqsum_accumulator); + } + } + + + static inline + void integral_64_operations_avx512(const __m512i *srcs, + __m512i *sums, const __m512i *sums_above, + __m512i *sqsums, const __m512i *sqsums_above, + __mmask64 data_mask, + __m512i &sum_accumulator, __m512i &sqsum_accumulator) + { + __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask); + + for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) { + __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); // Get lower 16 bytes of data + + for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) { + + __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes); + + // Calculate integral for the sum on the 8 entries + integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator); + sums++; sums_above++; + + if (sqsums){ // Calculate integral for the sum on the 8 entries + __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs); + + integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator); + sqsums++; sqsums_above++; + } + + // Prepare for next iteration of inner loop + // shift source to align next 8 bytes to lane 0 and shift the mask + src_16bytes = shift_right_8_bytes(src_16bytes); + data_mask = data_mask >> 8; + + } + + // Prepare for next iteration of outer loop + src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk); + } + } + + + static inline + void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask, + __m512i *results_ptr, __m512i 
&accumulator) + { + _mm512_mask_storeu_pd( + results_ptr, // Store the result here + data_mask, // Using the data mask to avoid overrunning the line + calculate_integral( // Writing the value of the integral derived from: + src_longs, // input data + _mm512_maskz_loadu_pd(data_mask, above_values_ptr), // and the results from line above + accumulator // keeping track of the accumulator + ) + ); + } + + + static inline + __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) + { + __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5); + + // Align data to prepare for the adds: + // shifts data left by 3 and 6 qwords (lanes) and gets rolling sum in all lanes + // Vertical LANES: 76543210 + // src_longs : HGFEDCBA + // shifted3lanes : + EDCBA + // shifted6lanes : + BA + // carry_over_idxs : + 65765765 (index position of result from previous iteration) + // = integral + __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs); + __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); + __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); + + // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator) + __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes); + __m512i sum_src_carry = _mm512_add_epi64(src_longs, carry_over); + accumulator = _mm512_add_epi64(sum_shift3and6, sum_src_carry); + + // Convert to packed double and add to the line above to get the true integral value + __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); + __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); + return integral_pd; + } + + + static inline + __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask) { + return _mm512_maskz_loadu_epi8(data_mask, srcs); + } + + + static inline + __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes) { + return _mm512_cvtepu8_epi64(src_16bytes); + } + + + static inline + __m128i shift_right_8_bytes(__m128i src_16bytes) { + return _mm_maskz_compress_epi64(2, src_16bytes); + } + + + static inline + __m512i shift_right_16_bytes(__m512i src_64byte_chunk) { + return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk); + } + +}; +} // end of anonymous namespace + +namespace opt_AVX512_SKX { + +// This is the implementation for the external callers' interface entry point. +// It should be the only function called into this file from outside +// Any new implementations should be directed from here +void calculate_integral_avx512(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn) +{ + IntegralCalculator_3Channel calculator; + calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn); +} + + +} // end namespace opt_AVX512_SKX +} // end namespace cv diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index 3c49aaf773..ae7647b8bd 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -10,7 +10,7 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners.
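For context, the new AVX-512 kernel above is only reached through cv::integral() for an untilted integral of a 3-channel 8-bit image with double-precision sum/sqsum, and only when the CPU and build actually provide AVX-512 (CV_TRY_AVX512_SKX / CV_CPU_HAS_SUPPORT_AVX512_SKX). A minimal sketch of that accelerated case from Python, cross-checked against a plain NumPy reduction (image size and pixel values here are arbitrary, not taken from the patch):

import numpy as np
import cv2 as cv

# Arbitrary 3-channel 8-bit input: the case targeted by the new kernel
# (untilted integral, cn == 3, double-precision accumulators).
img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

# Request CV_64F for both sum and sqsum so the double-precision path is
# eligible; dispatch to the AVX-512 code happens inside cv::integral.
sum_img, sqsum_img = cv.integral2(img, sdepth=cv.CV_64F, sqdepth=cv.CV_64F)

# integral(y, x) is the per-channel sum over img[:y, :x]; spot-check one point.
y, x = 200, 300
ref = img[:y, :x].astype(np.float64).sum(axis=(0, 1))
assert np.allclose(sum_img[y, x], ref)
assert np.allclose(sqsum_img[y, x], (img[:y, :x].astype(np.float64) ** 2).sum(axis=(0, 1)))

The result is identical whether or not the AVX-512 path is taken; the specialization only changes how fast the sums are produced.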
@@ -44,6 +44,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "opencv2/core/hal/intrin.hpp" +#include "sumpixels.hpp" namespace cv @@ -62,6 +63,37 @@ struct Integral_SIMD } }; + +template <> +struct Integral_SIMD { + Integral_SIMD() {}; + + + bool operator()(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + double *tilted, size_t _tiltedstep, + int width, int height, int cn) const + { +#if CV_TRY_AVX512_SKX + CV_UNUSED(_tiltedstep); + // TODO: Add support for 1,2, and 4 channels + if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){ + opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep, + sqsum, _sqsumstep, width, height, cn); + return true; + } +#else + // Avoid warnings in some builds + CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep); + CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep); + CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn); +#endif + return false; + } + +}; + #if CV_SIMD && CV_SIMD_WIDTH <= 64 template <> diff --git a/modules/imgproc/src/sumpixels.hpp b/modules/imgproc/src/sumpixels.hpp new file mode 100644 index 0000000000..8d5ab0a851 --- /dev/null +++ b/modules/imgproc/src/sumpixels.hpp @@ -0,0 +1,25 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2019, Intel Corporation, all rights reserved. +#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP +#define OPENCV_IMGPROC_SUM_PIXELS_HPP + +namespace cv +{ + +namespace opt_AVX512_SKX +{ +#if CV_TRY_AVX512_SKX + void calculate_integral_avx512( + const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn); + +#endif +} // end namespace opt_AVX512_SKX +} // end namespace cv + +#endif diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index 7c5bb163f6..157a83b603 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -190,82 +190,78 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) int j = 0; const uchar* src = _src.ptr(); uchar* dst = _dst.ptr(); -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - v_uint8x16 thresh_u = v_setall_u8( thresh ); - v_uint8x16 maxval16 = v_setall_u8( maxval ); +#if CV_SIMD + v_uint8 thresh_u = vx_setall_u8( thresh ); + v_uint8 maxval16 = vx_setall_u8( maxval ); - switch( type ) + switch( type ) + { + case THRESH_BINARY: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - case THRESH_BINARY: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = thresh_u < v0; - v0 = v0 & maxval16; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = thresh_u < v0; + v0 = v0 & maxval16; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_BINARY_INV: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_BINARY_INV: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= 
roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = v0 <= thresh_u; + v0 = v0 & maxval16; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TRUNC: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TRUNC: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = v0 - ( v0 - thresh_u ); - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = v0 - ( v0 - thresh_u ); + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TOZERO: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TOZERO: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = ( thresh_u < v0 ) & v0; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = ( thresh_u < v0 ) & v0; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TOZERO_INV: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TOZERO_INV: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = ( v0 <= thresh_u ) & v0; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = ( v0 <= thresh_u ) & v0; + v_store( dst + j, v0 ); } - break; } + break; } #endif @@ -355,125 +351,156 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) const ushort* src = _src.ptr(); ushort* dst = _dst.ptr(); -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); - if (useSIMD) - { - int i, j; - v_uint16x8 thresh_u = v_setall_u16(thresh); - v_uint16x8 maxval16 = v_setall_u16(maxval); +#if CV_SIMD + int i, j; + v_uint16 thresh_u = vx_setall_u16(thresh); + v_uint16 maxval16 = vx_setall_u16(maxval); - switch (type) + switch (type) + { + case THRESH_BINARY: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { - case THRESH_BINARY: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) { - for (j = 0; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = thresh_u < v0; - v1 = thresh_u < v1; - v0 = v0 & maxval16; - v1 = v1 & maxval16; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } - - for (; j < roi.width; j++) - dst[j] = threshBinary(src[j], thresh, maxval); + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = thresh_u < v0; + v1 = thresh_u < v1; + v0 = v0 & maxval16; + v1 = v1 & maxval16; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; - - case THRESH_BINARY_INV: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + if (j <= roi.width - v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = 
v_load(src + j); - v1 = v_load(src + j + 8); - v0 = v0 <= thresh_u; - v1 = v1 <= thresh_u; - v0 = v0 & maxval16; - v1 = v1 & maxval16; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } - - for (; j < roi.width; j++) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + v_uint16 v0 = vx_load(src + j); + v0 = thresh_u < v0; + v0 = v0 & maxval16; + v_store(dst + j, v0); + j += v_uint16::nlanes; } - break; - case THRESH_TRUNC: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) - { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = v_min(v0, thresh_u); - v1 = v_min(v1, thresh_u); - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + for (; j < roi.width; j++) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; - for (; j < roi.width; j++) - dst[j] = threshTrunc(src[j], thresh); + case THRESH_BINARY_INV: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = v0 <= thresh_u; + v1 = v1 <= thresh_u; + v0 = v0 & maxval16; + v1 = v1 & maxval16; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; - - case THRESH_TOZERO: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + if (j <= roi.width - v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = (thresh_u < v0) & v0; - v1 = (thresh_u < v1) & v1; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + v_uint16 v0 = vx_load(src + j); + v0 = v0 <= thresh_u; + v0 = v0 & maxval16; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } - for (; j < roi.width; j++) - dst[j] = threshToZero(src[j], thresh); + for (; j < roi.width; j++) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = v_min(v0, thresh_u); + v1 = v_min(v1, thresh_u); + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); + } + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = v_min(v0, thresh_u); + v_store(dst + j, v0); + j += v_uint16::nlanes; } - break; - case THRESH_TOZERO_INV: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + for (; j < roi.width; j++) + dst[j] = threshTrunc(src[j], thresh); + } + break; + + case THRESH_TOZERO: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = (v0 <= thresh_u) & v0; - v1 = (v1 <= thresh_u) & v1; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = (thresh_u < v0) & v0; + v1 = (thresh_u < v1) & v1; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); + } + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = (thresh_u < v0) & v0; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } - 
for (; j < roi.width; j++) - dst[j] = threshToZeroInv(src[j], thresh); + for (; j < roi.width; j++) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = (v0 <= thresh_u) & v0; + v1 = (v1 <= thresh_u) & v1; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = (v0 <= thresh_u) & v0; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } + + for (; j < roi.width; j++) + dst[j] = threshToZeroInv(src[j], thresh); } + break; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } } static void @@ -544,128 +571,159 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) } #endif -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - int i, j; - v_int16x8 thresh8 = v_setall_s16( thresh ); - v_int16x8 maxval8 = v_setall_s16( maxval ); +#if CV_SIMD + int i, j; + v_int16 thresh8 = vx_setall_s16( thresh ); + v_int16 maxval8 = vx_setall_s16( maxval ); - switch( type ) + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - case THRESH_BINARY: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = thresh8 < v0; - v1 = thresh8 < v1; - v0 = v0 & maxval8; - v1 = v1 & maxval8; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = thresh8 < v0; + v1 = thresh8 < v1; + v0 = v0 & maxval8; + v1 = v1 & maxval8; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - - case THRESH_BINARY_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + if( j <= roi.width - v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = v0 <= thresh8; - v1 = v1 <= thresh8; - v0 = v0 & maxval8; - v1 = v1 & maxval8; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + v_int16 v0 = vx_load( src + j ); + v0 = thresh8 < v0; + v0 = v0 & maxval8; + v_store( dst + j, v0 ); + j += v_int16::nlanes; } - break; - case THRESH_TRUNC: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = v_min( v0, thresh8 ); - v1 = v_min( v1, thresh8 ); - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + for( ; j < roi.width; j++ ) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; - for( ; j < roi.width; j++ ) - dst[j] = threshTrunc( src[j], thresh ); + case THRESH_BINARY_INV: + 
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = v0 <= thresh8; + v1 = v1 <= thresh8; + v0 = v0 & maxval8; + v1 = v1 & maxval8; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - - case THRESH_TOZERO: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + if( j <= roi.width - v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = ( thresh8 < v0 ) & v0; - v1 = ( thresh8 < v1 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + v_int16 v0 = vx_load( src + j ); + v0 = v0 <= thresh8; + v0 = v0 & maxval8; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } - for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); + for( ; j < roi.width; j++ ) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = v_min( v0, thresh8 ); + v1 = v_min( v1, thresh8 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); + } + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh8 ); + v_store( dst + j, v0 ); + j += v_int16::nlanes; } - break; - case THRESH_TOZERO_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + for( ; j < roi.width; j++ ) + dst[j] = threshTrunc( src[j], thresh ); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = ( v0 <= thresh8 ) & v0; - v1 = ( v1 <= thresh8 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = ( thresh8 < v0 ) & v0; + v1 = ( thresh8 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); + } + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = ( thresh8 < v0 ) & v0; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } - for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); + for( ; j < roi.width; j++ ) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = ( v0 <= thresh8 ) & v0; + v1 = ( v1 <= thresh8 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - default: - CV_Error( CV_StsBadArg, "" ); return; + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh8 ) & v0; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZeroInv(src[j], thresh); } + break; + 
default: + CV_Error( CV_StsBadArg, "" ); return; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } } @@ -719,175 +777,40 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) } #endif -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - int i, j; - v_float32x4 thresh4 = v_setall_f32( thresh ); - v_float32x4 maxval4 = v_setall_f32( maxval ); - - switch( type ) - { - case THRESH_BINARY: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = thresh4 < v0; - v1 = thresh4 < v1; - v0 = v0 & maxval4; - v1 = v1 & maxval4; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); - } - break; - - case THRESH_BINARY_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = v0 <= thresh4; - v1 = v1 <= thresh4; - v0 = v0 & maxval4; - v1 = v1 & maxval4; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); - } - break; - - case THRESH_TRUNC: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = v_min( v0, thresh4 ); - v1 = v_min( v1, thresh4 ); - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshTrunc(src[j], thresh); - } - break; - - case THRESH_TOZERO: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = ( thresh4 < v0 ) & v0; - v1 = ( thresh4 < v1 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); - } - break; - - case THRESH_TOZERO_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = ( v0 <= thresh4 ) & v0; - v1 = ( v1 <= thresh4 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); - } - break; - default: - CV_Error( CV_StsBadArg, "" ); return; - } - } - else -#endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } -} - -static void -thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) -{ - Size roi = _src.size(); - roi.width *= _src.channels(); - const double* src = _src.ptr(); - double* dst = _dst.ptr(); - size_t src_step = _src.step / sizeof(src[0]); - size_t dst_step = _dst.step / sizeof(dst[0]); - - if (_src.isContinuous() && _dst.isContinuous()) - { - roi.width *= roi.height; - roi.height = 1; - } +#if CV_SIMD + int i, j; + v_float32 thresh4 = vx_setall_f32( thresh ); + v_float32 maxval4 = vx_setall_f32( maxval ); -#if 
CV_SIMD128_64F - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) + switch( type ) { - int i, j; - v_float64x2 thresh2 = v_setall_f64( thresh ); - v_float64x2 maxval2 = v_setall_f64( maxval ); - - switch( type ) - { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = thresh2 < v0; - v1 = thresh2 < v1; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = thresh4 < v0; + v1 = thresh4 < v1; + v0 = v0 & maxval4; + v1 = v1 & maxval4; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) + { + v_float32 v0 = vx_load( src + j ); + v0 = thresh4 < v0; + v0 = v0 & maxval4; + v_store( dst + j, v0 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); + dst[j] = threshBinary(src[j], thresh, maxval); } break; @@ -895,21 +818,29 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = v0 <= thresh2; - v1 = v1 <= thresh2; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = v0 <= thresh4; + v1 = v1 <= thresh4; + v0 = v0 & maxval4; + v1 = v1 & maxval4; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) + { + v_float32 v0 = vx_load( src + j ); + v0 = v0 <= thresh4; + v0 = v0 & maxval4; + v_store( dst + j, v0 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + dst[j] = threshBinaryInv(src[j], thresh, maxval); } break; @@ -917,19 +848,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = v_min( v0, thresh4 ); + v1 = v_min( v1, thresh4 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = v_min( v0, thresh2 ); - v1 = v_min( v1, thresh2 ); + v_float32 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh4 ); v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshTrunc(src[j], thresh); + dst[j] = threshTrunc(src[j], thresh); } break; @@ -937,19 +875,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 
2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = ( thresh4 < v0 ) & v0; + v1 = ( thresh4 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = ( thresh2 < v0 ) & v0; - v1 = ( thresh2 < v1 ) & v1; + v_float32 v0 = vx_load( src + j ); + v0 = ( thresh4 < v0 ) & v0; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); + dst[j] = threshToZero(src[j], thresh); } break; @@ -957,30 +902,205 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = ( v0 <= thresh4 ) & v0; + v1 = ( v1 <= thresh4 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = ( v0 <= thresh2 ) & v0; - v1 = ( v1 <= thresh2 ) & v1; + v_float32 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh4 ) & v0; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); + dst[j] = threshToZeroInv(src[j], thresh); } break; default: - CV_Error(CV_StsBadArg, ""); return; - } + CV_Error( CV_StsBadArg, "" ); return; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif +} + +static void +thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) +{ + Size roi = _src.size(); + roi.width *= _src.channels(); + const double* src = _src.ptr(); + double* dst = _dst.ptr(); + size_t src_step = _src.step / sizeof(src[0]); + size_t dst_step = _dst.step / sizeof(dst[0]); + + if (_src.isContinuous() && _dst.isContinuous()) { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); + roi.width *= roi.height; + roi.height = 1; } + +#if CV_SIMD_64F + int i, j; + v_float64 thresh2 = vx_setall_f64( thresh ); + v_float64 maxval2 = vx_setall_f64( maxval ); + + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = thresh2 < v0; + v1 = thresh2 < v1; + v0 = v0 & maxval2; + v1 = v1 & maxval2; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = thresh2 < v0; + v0 = v0 & maxval2; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; + + case THRESH_BINARY_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = v0 <= thresh2; + v1 = v1 <= 
thresh2; + v0 = v0 & maxval2; + v1 = v1 & maxval2; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = v0 <= thresh2; + v0 = v0 & maxval2; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = v_min( v0, thresh2 ); + v1 = v_min( v1, thresh2 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh2 ); + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshTrunc(src[j], thresh); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = ( thresh2 < v0 ) & v0; + v1 = ( thresh2 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = ( thresh2 < v0 ) & v0; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = ( v0 <= thresh2 ) & v0; + v1 = ( v1 <= thresh2 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh2 ) & v0; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZeroInv(src[j], thresh); + } + break; + default: + CV_Error(CV_StsBadArg, ""); return; + } +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); +#endif } #ifdef HAVE_IPP diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp index c36bc1fd6e..6ad51ad512 100644 --- a/modules/imgproc/test/test_color.cpp +++ b/modules/imgproc/test/test_color.cpp @@ -3062,4 +3062,14 @@ TEST(ImgProc_BGR2RGBA, 3ch24ch) EXPECT_DOUBLE_EQ(cvtest::norm(expected - dst, NORM_INF), 0.); } +TEST(ImgProc_RGB2YUV, regression_13668) +{ + Mat src(Size(32, 4), CV_8UC3, Scalar(9, 250, 82)); // Ensure that SIMD code path works + Mat dst; + cvtColor(src, dst, COLOR_RGB2YUV); + Vec3b res = dst.at(0, 0); + Vec3b ref(159, 90, 0); + EXPECT_EQ(res, ref); +} + }} // namespace diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index ec5d4007a3..fadff0c02b 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -387,7 +387,7 @@ bool QRDetect::computeTransformationPoints() findNonZero(mask_roi, non_zero_elem[i]); newHull.insert(newHull.end(), non_zero_elem[i].begin(), 
non_zero_elem[i].end()); } - convexHull(Mat(newHull), locations); + convexHull(newHull, locations); for (size_t i = 0; i < locations.size(); i++) { for (size_t j = 0; j < 3; j++) @@ -556,7 +556,7 @@ vector QRDetect::getQuadrilateral(vector angle_list) } vector integer_hull; - convexHull(Mat(locations), integer_hull); + convexHull(locations, integer_hull); int hull_size = (int)integer_hull.size(); vector hull(hull_size); for (int i = 0; i < hull_size; i++) @@ -901,7 +901,7 @@ bool QRDecode::versionDefinition() vector locations, non_zero_elem; Mat mask_roi = mask(Range(1, intermediate.rows - 1), Range(1, intermediate.cols - 1)); findNonZero(mask_roi, non_zero_elem); - convexHull(Mat(non_zero_elem), locations); + convexHull(non_zero_elem, locations); Point offset = computeOffset(locations); Point temp_remote = locations[0], remote_point; diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp index 2e6251b210..e06dbbf379 100644 --- a/modules/video/src/optflowgf.cpp +++ b/modules/video/src/optflowgf.cpp @@ -646,8 +646,6 @@ private: Size size = frame0.size(); UMat prevFlowX, prevFlowY, curFlowX, curFlowY; - flowx.create(size, CV_32F); - flowy.create(size, CV_32F); UMat flowx0 = flowx; UMat flowy0 = flowy; @@ -1075,12 +1073,19 @@ private: return false; std::vector flowar; - if (!_flow0.empty()) + + // If flag is set, check for integrity; if not set, allocate memory space + if (flags_ & OPTFLOW_USE_INITIAL_FLOW) + { + if (_flow0.empty() || _flow0.size() != _prev0.size() || _flow0.channels() != 2 || + _flow0.depth() != CV_32F) + return false; split(_flow0, flowar); + } else { - flowar.push_back(UMat()); - flowar.push_back(UMat()); + flowar.push_back(UMat(_prev0.size(), CV_32FC1)); + flowar.push_back(UMat(_prev0.size(), CV_32FC1)); } if(!this->operator()(_prev0.getUMat(), _next0.getUMat(), flowar[0], flowar[1])){ return false; @@ -1112,7 +1117,14 @@ void FarnebackOpticalFlowImpl::calc(InputArray _prev0, InputArray _next0, CV_Assert( prev0.size() == next0.size() && prev0.channels() == next0.channels() && prev0.channels() == 1 && pyrScale_ < 1 ); - _flow0.create( prev0.size(), CV_32FC2 ); + + // If flag is set, check for integrity; if not set, allocate memory space + if( flags_ & OPTFLOW_USE_INITIAL_FLOW ) + CV_Assert( _flow0.size() == prev0.size() && _flow0.channels() == 2 && + _flow0.depth() == CV_32F ); + else + _flow0.create( prev0.size(), CV_32FC2 ); + Mat flow0 = _flow0.getMat(); for( k = 0, scale = 1; k < levels; k++ ) diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml index 0e7198a660..d177a09aab 100644 --- a/samples/dnn/models.yml +++ b/samples/dnn/models.yml @@ -90,6 +90,18 @@ squeezenet: classes: "classification_classes_ILSVRC2012.txt" sample: "classification" +# Googlenet from https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet +googlenet: + model: "bvlc_googlenet.caffemodel" + config: "bvlc_googlenet.prototxt" + mean: [104, 117, 123] + scale: 1.0 + width: 224 + height: 224 + rgb: false + classes: "classification_classes_ILSVRC2012.txt" + sample: "classification" + ################################################################################ # Semantic segmentation models. 
################################################################################ diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py index bf04c42174..a644420780 100644 --- a/samples/dnn/tf_text_graph_common.py +++ b/samples/dnn/tf_text_graph_common.py @@ -289,7 +289,7 @@ def removeUnusedNodesAndAttrs(to_remove, graph_def): op = graph_def.node[i].op name = graph_def.node[i].name - if op == 'Const' or to_remove(name, op): + if to_remove(name, op): if op != 'Const': removedNodes.append(name) diff --git a/samples/dnn/tf_text_graph_faster_rcnn.py b/samples/dnn/tf_text_graph_faster_rcnn.py index 13a9c29ec0..e1dfba9fee 100644 --- a/samples/dnn/tf_text_graph_faster_rcnn.py +++ b/samples/dnn/tf_text_graph_faster_rcnn.py @@ -48,10 +48,42 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath): removeIdentity(graph_def) + nodesToKeep = [] def to_remove(name, op): - return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ + if name in nodesToKeep: + return False + return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ (name.startswith('CropAndResize') and op != 'CropAndResize') + # Fuse atrous convolutions (with dilations). + nodesMap = {node.name: node for node in graph_def.node} + for node in reversed(graph_def.node): + if node.op == 'BatchToSpaceND': + del node.input[2] + conv = nodesMap[node.input[0]] + spaceToBatchND = nodesMap[conv.input[0]] + + # Extract paddings + stridedSlice = nodesMap[spaceToBatchND.input[2]] + assert(stridedSlice.op == 'StridedSlice') + pack = nodesMap[stridedSlice.input[0]] + assert(pack.op == 'Pack') + + padNodeH = nodesMap[nodesMap[pack.input[0]].input[0]] + padNodeW = nodesMap[nodesMap[pack.input[1]].input[0]] + padH = int(padNodeH.attr['value']['tensor'][0]['int_val'][0]) + padW = int(padNodeW.attr['value']['tensor'][0]['int_val'][0]) + + paddingsNode = NodeDef() + paddingsNode.name = conv.name + '/paddings' + paddingsNode.op = 'Const' + paddingsNode.addAttr('value', [padH, padH, padW, padW]) + graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode) + nodesToKeep.append(paddingsNode.name) + + spaceToBatchND.input[2] = paddingsNode.name + + removeUnusedNodesAndAttrs(to_remove, graph_def) @@ -225,6 +257,26 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath): detectionOut.addAttr('variance_encoded_in_target', True) graph_def.node.extend([detectionOut]) + def getUnconnectedNodes(): + unconnected = [node.name for node in graph_def.node] + for node in graph_def.node: + for inp in node.input: + if inp in unconnected: + unconnected.remove(inp) + return unconnected + + while True: + unconnectedNodes = getUnconnectedNodes() + unconnectedNodes.remove(detectionOut.name) + if not unconnectedNodes: + break + + for name in unconnectedNodes: + for i in range(len(graph_def.node)): + if graph_def.node[i].name == name: + del graph_def.node[i] + break + # Save as text. 
graph_def.save(outputPath) diff --git a/samples/dnn/tf_text_graph_mask_rcnn.py b/samples/dnn/tf_text_graph_mask_rcnn.py index aaefe456ad..c8803088f9 100644 --- a/samples/dnn/tf_text_graph_mask_rcnn.py +++ b/samples/dnn/tf_text_graph_mask_rcnn.py @@ -55,7 +55,7 @@ graph_def = parseTextGraph(args.output) removeIdentity(graph_def) def to_remove(name, op): - return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ + return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ (name.startswith('CropAndResize') and op != 'CropAndResize') removeUnusedNodesAndAttrs(to_remove, graph_def) diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py index 5017dba7a7..1576380646 100644 --- a/samples/dnn/tf_text_graph_ssd.py +++ b/samples/dnn/tf_text_graph_ssd.py @@ -10,14 +10,60 @@ # Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function. # See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API import argparse +import re from math import sqrt from tf_text_graph_common import * +class SSDAnchorGenerator: + def __init__(self, min_scale, max_scale, num_layers, aspect_ratios, + reduce_boxes_in_lowest_layer, image_width, image_height): + self.min_scale = min_scale + self.aspect_ratios = aspect_ratios + self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer + self.image_width = image_width + self.image_height = image_height + self.scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) + for i in range(num_layers)] + [1.0] + + def get(self, layer_id): + if layer_id == 0 and self.reduce_boxes_in_lowest_layer: + widths = [0.1, self.min_scale * sqrt(2.0), self.min_scale * sqrt(0.5)] + heights = [0.1, self.min_scale / sqrt(2.0), self.min_scale / sqrt(0.5)] + else: + widths = [self.scales[layer_id] * sqrt(ar) for ar in self.aspect_ratios] + heights = [self.scales[layer_id] / sqrt(ar) for ar in self.aspect_ratios] + + widths += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])] + heights += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])] + widths = [w * self.image_width for w in widths] + heights = [h * self.image_height for h in heights] + return widths, heights + + +class MultiscaleAnchorGenerator: + def __init__(self, min_level, aspect_ratios, scales_per_octave, anchor_scale): + self.min_level = min_level + self.aspect_ratios = aspect_ratios + self.anchor_scale = anchor_scale + self.scales = [2**(float(s) / scales_per_octave) for s in range(scales_per_octave)] + + def get(self, layer_id): + widths = [] + heights = [] + for a in self.aspect_ratios: + for s in self.scales: + base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale + ar = sqrt(a) + heights.append(base_anchor_size * s / ar) + widths.append(base_anchor_size * s * ar) + return widths, heights + + def createSSDGraph(modelPath, configPath, outputPath): # Nodes that should be kept. 
- keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu6', 'Placeholder', 'FusedBatchNorm', + keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu', 'Relu6', 'Placeholder', 'FusedBatchNorm', 'DepthwiseConv2dNative', 'ConcatV2', 'Mul', 'MaxPool', 'AvgPool', 'Identity', - 'Sub'] + 'Sub', 'ResizeNearestNeighbor', 'Pad'] # Node with which prefixes should be removed prefixesToRemove = ('MultipleGridAnchorGenerator/', 'Postprocessor/', 'Preprocessor/map') @@ -27,26 +73,50 @@ def createSSDGraph(modelPath, configPath, outputPath): config = config['model'][0]['ssd'][0] num_classes = int(config['num_classes'][0]) - ssd_anchor_generator = config['anchor_generator'][0]['ssd_anchor_generator'][0] - min_scale = float(ssd_anchor_generator['min_scale'][0]) - max_scale = float(ssd_anchor_generator['max_scale'][0]) - num_layers = int(ssd_anchor_generator['num_layers'][0]) - aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']] - reduce_boxes_in_lowest_layer = True - if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator: - reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true' - fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0] image_width = int(fixed_shape_resizer['width'][0]) image_height = int(fixed_shape_resizer['height'][0]) box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional' + anchor_generator = config['anchor_generator'][0] + if 'ssd_anchor_generator' in anchor_generator: + ssd_anchor_generator = anchor_generator['ssd_anchor_generator'][0] + min_scale = float(ssd_anchor_generator['min_scale'][0]) + max_scale = float(ssd_anchor_generator['max_scale'][0]) + num_layers = int(ssd_anchor_generator['num_layers'][0]) + aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']] + reduce_boxes_in_lowest_layer = True + if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator: + reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true' + priors_generator = SSDAnchorGenerator(min_scale, max_scale, num_layers, + aspect_ratios, reduce_boxes_in_lowest_layer, + image_width, image_height) + + + print('Scale: [%f-%f]' % (min_scale, max_scale)) + print('Aspect ratios: %s' % str(aspect_ratios)) + print('Reduce boxes in the lowest layer: %s' % str(reduce_boxes_in_lowest_layer)) + elif 'multiscale_anchor_generator' in anchor_generator: + multiscale_anchor_generator = anchor_generator['multiscale_anchor_generator'][0] + min_level = int(multiscale_anchor_generator['min_level'][0]) + max_level = int(multiscale_anchor_generator['max_level'][0]) + anchor_scale = float(multiscale_anchor_generator['anchor_scale'][0]) + aspect_ratios = [float(ar) for ar in multiscale_anchor_generator['aspect_ratios']] + scales_per_octave = int(multiscale_anchor_generator['scales_per_octave'][0]) + num_layers = max_level - min_level + 1 + priors_generator = MultiscaleAnchorGenerator(min_level, aspect_ratios, + scales_per_octave, anchor_scale) + print('Levels: [%d-%d]' % (min_level, max_level)) + print('Anchor scale: %f' % anchor_scale) + print('Scales per octave: %d' % scales_per_octave) + print('Aspect ratios: %s' % str(aspect_ratios)) + else: + print('Unknown anchor_generator') + exit(0) + print('Number of classes: %d' % num_classes) print('Number of layers: %d' % num_layers) - print('Scale: [%f-%f]' % (min_scale, max_scale)) - print('Aspect ratios: %s' % str(aspect_ratios)) - print('Reduce boxes in the lowest layer: %s' % 
str(reduce_boxes_in_lowest_layer)) print('box predictor: %s' % box_predictor) print('Input image size: %dx%d' % (image_width, image_height)) @@ -67,8 +137,8 @@ def createSSDGraph(modelPath, configPath, outputPath): return unconnected - # Detect unfused batch normalization nodes and fuse them. - def fuse_batch_normalization(): + def fuse_nodes(nodesToKeep): + # Detect unfused batch normalization nodes and fuse them. # Add_0 <-- moving_variance, add_y # Rsqrt <-- Add_0 # Mul_0 <-- Rsqrt, gamma @@ -77,9 +147,15 @@ def createSSDGraph(modelPath, configPath, outputPath): # Sub_0 <-- beta, Mul_2 # Add_1 <-- Mul_1, Sub_0 nodesMap = {node.name: node for node in graph_def.node} - subgraph = ['Add', + subgraphBatchNorm = ['Add', ['Mul', 'input', ['Mul', ['Rsqrt', ['Add', 'moving_variance', 'add_y']], 'gamma']], ['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]] + # Detect unfused nearest neighbor resize. + subgraphResizeNN = ['Reshape', + ['Mul', ['Reshape', 'input', ['Pack', 'shape_1', 'shape_2', 'shape_3', 'shape_4', 'shape_5']], + 'ones'], + ['Pack', ['StridedSlice', ['Shape', 'input'], 'stack', 'stack_1', 'stack_2'], + 'out_height', 'out_width', 'out_channels']] def checkSubgraph(node, targetNode, inputs, fusedNodes): op = targetNode[0] if node.op == op and (len(node.input) >= len(targetNode) - 1): @@ -100,7 +176,7 @@ def createSSDGraph(modelPath, configPath, outputPath): for node in graph_def.node: inputs = {} fusedNodes = [] - if checkSubgraph(node, subgraph, inputs, fusedNodes): + if checkSubgraph(node, subgraphBatchNorm, inputs, fusedNodes): name = node.name node.Clear() node.name = name @@ -112,15 +188,41 @@ def createSSDGraph(modelPath, configPath, outputPath): node.input.append(inputs['moving_variance']) node.addAttr('epsilon', 0.001) nodesToRemove += fusedNodes[1:] + + inputs = {} + fusedNodes = [] + if checkSubgraph(node, subgraphResizeNN, inputs, fusedNodes): + name = node.name + node.Clear() + node.name = name + node.op = 'ResizeNearestNeighbor' + node.input.append(inputs['input']) + node.input.append(name + '/output_shape') + + out_height_node = nodesMap[inputs['out_height']] + out_width_node = nodesMap[inputs['out_width']] + out_height = int(out_height_node.attr['value']['tensor'][0]['int_val'][0]) + out_width = int(out_width_node.attr['value']['tensor'][0]['int_val'][0]) + + shapeNode = NodeDef() + shapeNode.name = name + '/output_shape' + shapeNode.op = 'Const' + shapeNode.addAttr('value', [out_height, out_width]) + graph_def.node.insert(graph_def.node.index(node), shapeNode) + nodesToKeep.append(shapeNode.name) + + nodesToRemove += fusedNodes[1:] for node in nodesToRemove: graph_def.node.remove(node) - fuse_batch_normalization() + nodesToKeep = [] + fuse_nodes(nodesToKeep) removeIdentity(graph_def) def to_remove(name, op): - return (not op in keepOps) or name.startswith(prefixesToRemove) + return (not name in nodesToKeep) and \ + (op == 'Const' or (not op in keepOps) or name.startswith(prefixesToRemove)) removeUnusedNodesAndAttrs(to_remove, graph_def) @@ -169,19 +271,15 @@ def createSSDGraph(modelPath, configPath, outputPath): graph_def.node.extend([flatten]) addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten') - idx = 0 + num_matched_layers = 0 for node in graph_def.node: - if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx) or \ - node.name == ('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/Conv2D' % idx) or \ - node.name == 'WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D': + if 
re.match('BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \ + re.match('WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name): node.addAttr('loc_pred_transposed', True) - idx += 1 - assert(idx == num_layers) + num_matched_layers += 1 + assert(num_matched_layers == num_layers) # Add layers that generate anchors (bounding boxes proposals). - scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) - for i in range(num_layers)] + [1.0] - priorBoxes = [] for i in range(num_layers): priorBox = NodeDef() @@ -199,17 +297,8 @@ def createSSDGraph(modelPath, configPath, outputPath): priorBox.addAttr('flip', False) priorBox.addAttr('clip', False) - if i == 0 and reduce_boxes_in_lowest_layer: - widths = [0.1, min_scale * sqrt(2.0), min_scale * sqrt(0.5)] - heights = [0.1, min_scale / sqrt(2.0), min_scale / sqrt(0.5)] - else: - widths = [scales[i] * sqrt(ar) for ar in aspect_ratios] - heights = [scales[i] / sqrt(ar) for ar in aspect_ratios] + widths, heights = priors_generator.get(i) - widths += [sqrt(scales[i] * scales[i + 1])] - heights += [sqrt(scales[i] * scales[i + 1])] - widths = [w * image_width for w in widths] - heights = [h * image_height for h in heights] priorBox.addAttr('width', widths) priorBox.addAttr('height', heights) priorBox.addAttr('variance', [0.1, 0.1, 0.2, 0.2]) @@ -217,6 +306,7 @@ def createSSDGraph(modelPath, configPath, outputPath): graph_def.node.extend([priorBox]) priorBoxes.append(priorBox.name) + # Compare this layer's output with Postprocessor/Reshape addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten') # Sigmoid for classes predictions and DetectionOutput layer
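To make the prior-box arithmetic of the SSDAnchorGenerator class introduced above concrete, here is a small standalone sketch of the same scale/width/height computation. The config values (min_scale 0.2, max_scale 0.95, 6 layers, 300x300 input) are typical ssd_mobilenet settings used purely for illustration; the real numbers come from the pipeline .config file parsed by createSSDGraph().

from math import sqrt

# Illustrative SSD settings, not values taken from this patch.
min_scale, max_scale, num_layers = 0.2, 0.95, 6
aspect_ratios = [1.0, 2.0, 0.5, 3.0, 1.0 / 3]
image_width = image_height = 300

# Same per-layer scales as SSDAnchorGenerator.__init__.
scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
          for i in range(num_layers)] + [1.0]

# Same box sizes as SSDAnchorGenerator.get() for a layer above the first one.
layer_id = 1
widths = [scales[layer_id] * sqrt(ar) for ar in aspect_ratios]
heights = [scales[layer_id] / sqrt(ar) for ar in aspect_ratios]
# Extra box whose size is the geometric mean of this scale and the next one.
widths += [sqrt(scales[layer_id] * scales[layer_id + 1])]
heights += [sqrt(scales[layer_id] * scales[layer_id + 1])]
widths = [w * image_width for w in widths]
heights = [h * image_height for h in heights]
print([round(w, 1) for w in widths])   # prior-box widths in pixels for this feature map
print([round(h, 1) for h in heights])  # prior-box heights in pixels

These widths and heights are exactly what gets written into the PriorBox nodes via priors_generator.get(i) in the loop over layers.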