diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6ef10448..82c18f3508 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -577,7 +577,7 @@ else() # Note: layout differs from OpenCV 3.4 include(GNUInstallDirs) ocv_update(OPENCV_INCLUDE_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}/opencv4") - ocv_update(OPENCV_LIB_INSTALL_PATH "${CMAKE_INSTALL_LIBDIR}${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${CMAKE_INSTALL_LIBDIR}") ocv_update(OPENCV_CONFIG_INSTALL_PATH "${OPENCV_LIB_INSTALL_PATH}/cmake/opencv4") ocv_update(OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_LIB_INSTALL_PATH}/opencv4/3rdparty") ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH "${CMAKE_INSTALL_DATAROOTDIR}/opencv4/samples") diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index a7ef8e1389..1d079a38d1 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -144,6 +144,7 @@ if(DOXYGEN_FOUND) string(REPLACE ";" " " CMAKE_DOXYGEN_ENABLED_SECTIONS "${CMAKE_DOXYGEN_ENABLED_SECTIONS}") # TODO: remove paths_doc from EXAMPLE_PATH after face module tutorials/samples moved to separate folders string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_EXAMPLE_PATH "${example_path} ; ${paths_doc} ; ${paths_sample}") + string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_INCLUDE_ROOTS "${paths_include}") set(CMAKE_DOXYGEN_LAYOUT "${CMAKE_CURRENT_BINARY_DIR}/DoxygenLayout.xml") set(CMAKE_DOXYGEN_OUTPUT_PATH "doxygen") set(CMAKE_DOXYGEN_MAIN_REFERENCE "${refs_main}") diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 9ac60f1b44..85f8fe161f 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -22,8 +22,8 @@ ABBREVIATE_BRIEF = "The $name class" \ ALWAYS_DETAILED_SEC = NO INLINE_INHERITED_MEMB = NO FULL_PATH_NAMES = YES -STRIP_FROM_PATH = @CMAKE_SOURCE_DIR@/modules -STRIP_FROM_INC_PATH = +STRIP_FROM_PATH = @CMAKE_SOURCE_DIR@/modules @CMAKE_DOXYGEN_INCLUDE_ROOTS@ +STRIP_FROM_INC_PATH = @CMAKE_DOXYGEN_INCLUDE_ROOTS@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO QT_AUTOBRIEF = NO @@ -72,8 +72,8 @@ INTERNAL_DOCS = NO CASE_SENSE_NAMES = YES HIDE_SCOPE_NAMES = NO SHOW_INCLUDE_FILES = YES -SHOW_GROUPED_MEMB_INC = NO -FORCE_LOCAL_INCLUDES = YES +SHOW_GROUPED_MEMB_INC = YES +FORCE_LOCAL_INCLUDES = NO INLINE_INFO = YES SORT_MEMBER_DOCS = YES SORT_BRIEF_DOCS = YES diff --git a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown index 8ab4c53908..ca7853d96a 100644 --- a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown +++ b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown @@ -53,8 +53,8 @@ import numpy as np import cv2 as cv import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate ORB detector orb = cv.ORB_create() @@ -79,7 +79,7 @@ matches = bf.match(des1,des2) matches = sorted(matches, key = lambda x:x.distance) # Draw first 10 matches. -img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10], flags=2) +img3 = cv.drawMatches(img1,kp1,img2,kp2,matches[:10],None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS) plt.imshow(img3),plt.show() @endcode @@ -104,13 +104,13 @@ so that we can apply ratio test explained by D.Lowe in his paper. 
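Aside for readers running this tutorial: the listings in this patch reflect two Python-API updates — images are read with the named `cv.IMREAD_GRAYSCALE` flag rather than the bare `0`, and `drawMatches`/`drawMatchesKnn` now take an explicit output-image argument (pass `None` to let OpenCV allocate it) plus a named `DrawMatchesFlags` constant. A minimal, self-contained sketch consolidating those calls, assuming the two sample images from this tutorial are on disk:

```python
import cv2 as cv

# Sketch only: 'box.png' / 'box_in_scene.png' are the tutorial's sample images.
img1 = cv.imread('box.png', cv.IMREAD_GRAYSCALE)           # queryImage
img2 = cv.imread('box_in_scene.png', cv.IMREAD_GRAYSCALE)  # trainImage

# ORB gives binary descriptors, so Hamming distance is the right norm;
# crossCheck=True keeps only mutually-best matches.
orb = cv.ORB_create()
kp1, des1 = orb.detectAndCompute(img1, None)
kp2, des2 = orb.detectAndCompute(img2, None)
bf = cv.BFMatcher(cv.NORM_HAMMING, crossCheck=True)
matches = sorted(bf.match(des1, des2), key=lambda m: m.distance)

# drawMatches takes an explicit output-image argument (None = allocate internally)
# and a named flags constant instead of a bare integer.
out = cv.drawMatches(img1, kp1, img2, kp2, matches[:10], None,
                     flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)
cv.imwrite('matches.png', out)
```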
@code{.py} import numpy as np import cv2 as cv -from matplotlib import pyplot as plt +import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate SIFT detector -sift = cv.SIFT() +sift = cv.xfeatures2d.SIFT_create() # find the keypoints and descriptors with SIFT kp1, des1 = sift.detectAndCompute(img1,None) @@ -118,7 +118,7 @@ kp2, des2 = sift.detectAndCompute(img2,None) # BFMatcher with default params bf = cv.BFMatcher() -matches = bf.knnMatch(des1,des2, k=2) +matches = bf.knnMatch(des1,des2,k=2) # Apply ratio test good = [] @@ -127,7 +127,7 @@ for m,n in matches: good.append([m]) # cv.drawMatchesKnn expects list of lists as matches. -img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,flags=2) +img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS) plt.imshow(img3),plt.show() @endcode @@ -168,13 +168,13 @@ With this information, we are good to go. @code{.py} import numpy as np import cv2 as cv -from matplotlib import pyplot as plt +import matplotlib.pyplot as plt -img1 = cv.imread('box.png',0) # queryImage -img2 = cv.imread('box_in_scene.png',0) # trainImage +img1 = cv.imread('box.png',cv.IMREAD_GRAYSCALE) # queryImage +img2 = cv.imread('box_in_scene.png',cv.IMREAD_GRAYSCALE) # trainImage # Initiate SIFT detector -sift = cv.SIFT() +sift = cv.xfeatures2d.SIFT_create() # find the keypoints and descriptors with SIFT kp1, des1 = sift.detectAndCompute(img1,None) @@ -190,7 +190,7 @@ flann = cv.FlannBasedMatcher(index_params,search_params) matches = flann.knnMatch(des1,des2,k=2) # Need to draw only good matches, so create a mask -matchesMask = [[0,0] for i in xrange(len(matches))] +matchesMask = [[0,0] for i in range(len(matches))] # ratio test as per Lowe's paper for i,(m,n) in enumerate(matches): @@ -200,7 +200,7 @@ for i,(m,n) in enumerate(matches): draw_params = dict(matchColor = (0,255,0), singlePointColor = (255,0,0), matchesMask = matchesMask, - flags = 0) + flags = cv.DrawMatchesFlags_DEFAULT) img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,matches,None,**draw_params) diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp index 0a21231c4d..481fafd452 100644 --- a/modules/calib3d/src/circlesgrid.cpp +++ b/modules/calib3d/src/circlesgrid.cpp @@ -156,7 +156,7 @@ void CirclesGridClusterFinder::findGrid(const std::vector &points, #endif std::vector hull2f; - convexHull(Mat(patternPoints), hull2f, false); + convexHull(patternPoints, hull2f, false); const size_t cornersCount = isAsymmetricGrid ? 
6 : 4; if(hull2f.size() < cornersCount) return; @@ -407,7 +407,7 @@ void CirclesGridClusterFinder::rectifyPatternPoints(const std::vector dstKeypoints; convertPointsFromHomogeneous(dstKeypointsMat, dstKeypoints); @@ -1168,7 +1168,7 @@ void CirclesGridFinder::findBasis(const std::vector &samples, std::vect } for (size_t i = 0; i < basis.size(); i++) { - convexHull(Mat(clusters[i]), hulls[i]); + convexHull(clusters[i], hulls[i]); } basisGraphs.resize(basis.size(), Graph(keypoints.size())); @@ -1183,7 +1183,7 @@ void CirclesGridFinder::findBasis(const std::vector &samples, std::vect for (size_t k = 0; k < hulls.size(); k++) { - if (pointPolygonTest(Mat(hulls[k]), vec, false) >= 0) + if (pointPolygonTest(hulls[k], vec, false) >= 0) { basisGraphs[k].addEdge(i, j); } @@ -1414,7 +1414,6 @@ void CirclesGridFinder::drawHoles(const Mat &srcImage, Mat &drawImage) const if (i != holes.size() - 1) line(drawImage, keypoints[holes[i][j]], keypoints[holes[i + 1][j]], Scalar(255, 0, 0), 2); - //circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness); circle(drawImage, keypoints[holes[i][j]], holeRadius, holeColor, holeThickness); } } diff --git a/modules/calib3d/src/homography_decomp.cpp b/modules/calib3d/src/homography_decomp.cpp index fea8882c5a..3bfb62ec2c 100644 --- a/modules/calib3d/src/homography_decomp.cpp +++ b/modules/calib3d/src/homography_decomp.cpp @@ -185,6 +185,10 @@ bool HomographyDecompZhang::findMotionFrom_tstar_n(const cv::Vec3d& tstar, const temp(1, 1) += 1.0; temp(2, 2) += 1.0; motion.R = getHnorm() * temp.inv(); + if (cv::determinant(motion.R) < 0) + { + motion.R *= -1; + } motion.t = motion.R * tstar; motion.n = n; return passesSameSideOfPlaneConstraint(motion); @@ -312,6 +316,10 @@ void HomographyDecompInria::findRmatFrom_tstar_n(const cv::Vec3d& tstar, const c 0.0, 0.0, 1.0); R = getHnorm() * (I - (2/v) * tstar_m * n_m.t() ); + if (cv::determinant(R) < 0) + { + R *= -1; + } } void HomographyDecompInria::decompose(std::vector& camMotions) diff --git a/modules/calib3d/src/quadsubpix.cpp b/modules/calib3d/src/quadsubpix.cpp index 77bc498591..b4100a22f9 100644 --- a/modules/calib3d/src/quadsubpix.cpp +++ b/modules/calib3d/src/quadsubpix.cpp @@ -194,9 +194,8 @@ bool cv::find4QuadCornerSubpix(InputArray _img, InputOutputArray _corners, Size erode(white_comp, white_comp, Mat(), Point(-1, -1), erode_count); std::vector > white_contours, black_contours; - std::vector white_hierarchy, black_hierarchy; - findContours(black_comp, black_contours, black_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE); - findContours(white_comp, white_contours, white_hierarchy, RETR_LIST, CHAIN_APPROX_SIMPLE); + findContours(black_comp, black_contours, RETR_LIST, CHAIN_APPROX_SIMPLE); + findContours(white_comp, white_contours, RETR_LIST, CHAIN_APPROX_SIMPLE); if(black_contours.size() < 5 || white_contours.size() < 5) continue; diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index c013d5adf8..74dd272943 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -1408,7 +1408,7 @@ bool CV_StereoCalibrationTest::checkPandROI( int test_case_idx, const Mat& M, co for( x = 0; x < N; x++ ) pts.push_back(Point2f((float)x*imgsize.width/(N-1), (float)y*imgsize.height/(N-1))); - undistortPoints(Mat(pts), upts, M, D, R, P ); + undistortPoints(pts, upts, M, D, R, P ); for( k = 0; k < N*N; k++ ) if( upts[k].x < -imgsize.width*eps || upts[k].x > imgsize.width*(1+eps) || upts[k].y < 
-imgsize.height*eps || upts[k].y > imgsize.height*(1+eps) ) @@ -1717,8 +1717,8 @@ void CV_StereoCalibrationTest::run( int ) for( int i = 0, k = 0; i < nframes; i++ ) { vector temp[2]; - undistortPoints(Mat(imgpt1[i]), temp[0], M1, D1, R1, P1); - undistortPoints(Mat(imgpt2[i]), temp[1], M2, D2, R2, P2); + undistortPoints(imgpt1[i], temp[0], M1, D1, R1, P1); + undistortPoints(imgpt2[i], temp[1], M2, D2, R2, P2); for( int j = 0; j < npoints; j++, k++ ) { diff --git a/modules/calib3d/test/test_cameracalibration_artificial.cpp b/modules/calib3d/test/test_cameracalibration_artificial.cpp index 165a66a7b1..a8351b6b66 100644 --- a/modules/calib3d/test/test_cameracalibration_artificial.cpp +++ b/modules/calib3d/test/test_cameracalibration_artificial.cpp @@ -353,7 +353,7 @@ protected: rvecs_spnp.resize(brdsNum); tvecs_spnp.resize(brdsNum); for(size_t i = 0; i < brdsNum; ++i) - solvePnP(Mat(objectPoints[i]), Mat(imagePoints[i]), camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]); + solvePnP(objectPoints[i], imagePoints[i], camMat, distCoeffs, rvecs_spnp[i], tvecs_spnp[i]); compareShiftVecs(tvecs_exp, tvecs_spnp); compareRotationVecs(rvecs_exp, rvecs_spnp); diff --git a/modules/calib3d/test/test_chessboardgenerator.cpp b/modules/calib3d/test/test_chessboardgenerator.cpp index 3a8c17345f..6926cb6e72 100644 --- a/modules/calib3d/test/test_chessboardgenerator.cpp +++ b/modules/calib3d/test/test_chessboardgenerator.cpp @@ -126,10 +126,10 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co generateEdge(p3, p4, pts_square3d); generateEdge(p4, p1, pts_square3d); - projectPoints(Mat(pts_square3d), rvec, tvec, camMat, distCoeffs, pts_square2d); + projectPoints(pts_square3d, rvec, tvec, camMat, distCoeffs, pts_square2d); squares_black.resize(squares_black.size() + 1); vector temp; - approxPolyDP(Mat(pts_square2d), temp, 1.0, true); + approxPolyDP(pts_square2d, temp, 1.0, true); transform(temp.begin(), temp.end(), back_inserter(squares_black.back()), Mult(rendererResolutionMultiplier)); } @@ -139,7 +139,7 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co for(int i = 0; i < patternSize.width - 1; ++i) corners3d.push_back(zero + (i + 1) * sqWidth * pb1 + (j + 1) * sqHeight * pb2); corners.clear(); - projectPoints(Mat(corners3d), rvec, tvec, camMat, distCoeffs, corners); + projectPoints(corners3d, rvec, tvec, camMat, distCoeffs, corners); vector whole3d; vector whole2d; @@ -147,9 +147,9 @@ Mat ChessBoardGenerator::generateChessBoard(const Mat& bg, const Mat& camMat, co generateEdge(whole[1], whole[2], whole3d); generateEdge(whole[2], whole[3], whole3d); generateEdge(whole[3], whole[0], whole3d); - projectPoints(Mat(whole3d), rvec, tvec, camMat, distCoeffs, whole2d); + projectPoints(whole3d, rvec, tvec, camMat, distCoeffs, whole2d); vector temp_whole2d; - approxPolyDP(Mat(whole2d), temp_whole2d, 1.0, true); + approxPolyDP(whole2d, temp_whole2d, 1.0, true); vector< vector > whole_contour(1); transform(temp_whole2d.begin(), temp_whole2d.end(), @@ -213,7 +213,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0; bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0; 
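The test updates above (and the remaining ChessBoardGenerator hunks below) only drop the redundant `Mat()` wrappers around point vectors; `projectPoints`, `approxPolyDP`, `undistortPoints` and `solvePnP` accept `std::vector` inputs directly through the `InputArray` interface, so behaviour is unchanged. For reference, a minimal hedged sketch of the same `projectPoints` call from Python, with purely illustrative camera parameters (identity intrinsics, zero rotation/translation, no distortion):

```python
import numpy as np
import cv2 as cv

# Illustrative values only: three 3D points in front of a unit-focal pinhole camera.
object_points = np.array([[0, 0, 5], [1, 0, 5], [0, 1, 5]], dtype=np.float32)
rvec = np.zeros(3, dtype=np.float32)          # no rotation
tvec = np.zeros(3, dtype=np.float32)          # no translation
camera_matrix = np.eye(3, dtype=np.float32)   # identity intrinsics
dist_coeffs = np.zeros(5, dtype=np.float32)   # no lens distortion

image_points, _ = cv.projectPoints(object_points, rvec, tvec, camera_matrix, dist_coeffs)
print(image_points.reshape(-1, 2))  # projected 2D coordinates of each 3D point
```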
@@ -278,7 +278,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); bool inrect1 = pts2d[0].x < bg.cols && pts2d[0].y < bg.rows && pts2d[0].x > 0 && pts2d[0].y > 0; bool inrect2 = pts2d[1].x < bg.cols && pts2d[1].y < bg.rows && pts2d[1].x > 0 && pts2d[1].y > 0; @@ -320,7 +320,7 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat pts3d[3] = p - pb1 * cbHalfWidthEx + cbHalfHeightEx * pb2; /* can remake with better perf */ - projectPoints(Mat(pts3d), rvec, tvec, camMat, distCoeffs, pts2d); + projectPoints(pts3d, rvec, tvec, camMat, distCoeffs, pts2d); Point3f zero = p - pb1 * cbHalfWidth - cbHalfHeight * pb2; diff --git a/modules/calib3d/test/test_homography_decomp.cpp b/modules/calib3d/test/test_homography_decomp.cpp index 45f5ae63ee..9ddc0e913d 100644 --- a/modules/calib3d/test/test_homography_decomp.cpp +++ b/modules/calib3d/test/test_homography_decomp.cpp @@ -134,4 +134,36 @@ private: TEST(Calib3d_DecomposeHomography, regression) { CV_HomographyDecompTest test; test.safe_run(); } + +TEST(Calib3d_DecomposeHomography, issue_4978) +{ + Matx33d K( + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0 + ); + + Matx33d H( + -0.102896, 0.270191, -0.0031153, + 0.0406387, 1.19569, -0.0120456, + 0.445351, 0.0410889, 1 + ); + + vector rotations; + vector translations; + vector normals; + + decomposeHomographyMat(H, K, rotations, translations, normals); + + ASSERT_GT(rotations.size(), (size_t)0u); + for (size_t i = 0; i < rotations.size(); i++) + { + // check: det(R) = 1 + EXPECT_TRUE(std::fabs(cv::determinant(rotations[i]) - 1.0) < 0.01) + << "R: det=" << cv::determinant(rotations[0]) << std::endl << rotations[i] << std::endl + << "T:" << std::endl << translations[i] << std::endl; + } +} + + }} // namespace diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp index 8eec7a7167..2359fa9282 100644 --- a/modules/calib3d/test/test_solvepnp_ransac.cpp +++ b/modules/calib3d/test/test_solvepnp_ransac.cpp @@ -124,7 +124,7 @@ protected: vector projectedPoints; projectedPoints.resize(points.size()); - projectPoints(Mat(points), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(points, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); for (size_t i = 0; i < projectedPoints.size(); i++) { if (i % 20 == 0) @@ -241,7 +241,7 @@ protected: vector projectedPoints; projectedPoints.resize(opoints.size()); - projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); bool isEstimateSuccess = solvePnP(opoints, projectedPoints, intrinsics, distCoeffs, rvec, tvec, false, method); if (isEstimateSuccess == false) @@ -291,7 +291,7 @@ class CV_solveP3P_Test : public CV_solvePnPRansac_Test vector projectedPoints; projectedPoints.resize(opoints.size()); - projectPoints(Mat(opoints), trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); + projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints); int num_of_solutions = solveP3P(opoints, projectedPoints, intrinsics, distCoeffs, rvecs, tvecs, method); if (num_of_solutions != (int) rvecs.size() || num_of_solutions != (int) tvecs.size() || num_of_solutions == 0) 
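The new issue_4978 regression test above pins the fix in homography_decomp.cpp: decompositions that come back as reflections (det(R) = -1) are now flipped into proper rotations. A minimal sketch reproducing the same check from Python, reusing the H and K matrices from the test:

```python
import numpy as np
import cv2 as cv

# H and K taken from the issue_4978 regression test above.
K = np.eye(3)
H = np.array([[-0.102896,  0.270191, -0.0031153],
              [ 0.0406387, 1.19569,  -0.0120456],
              [ 0.445351,  0.0410889, 1.0]])

num, rotations, translations, normals = cv.decomposeHomographyMat(H, K)
assert num > 0
for R in rotations:
    # Every returned rotation should be proper: det(R) == +1 within tolerance.
    assert abs(np.linalg.det(R) - 1.0) < 0.01
```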
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index bf9519dd1c..0e2c66b1f1 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -186,6 +186,16 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard # endif #endif +#ifndef CV_ALWAYS_INLINE +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) +#define CV_ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define CV_ALWAYS_INLINE __forceinline +#else +#define CV_ALWAYS_INLINE inline +#endif +#endif + #if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED) # define CV_ENABLE_UNROLLED 0 #else diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp index b41efdae1b..b14f4f66cc 100644 --- a/modules/dnn/include/opencv2/dnn/version.hpp +++ b/modules/dnn/include/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. -#define OPENCV_DNN_API_VERSION 20181221 +#define OPENCV_DNN_API_VERSION 20190122 #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION) diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index cc95cc58ae..d06689a7fb 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -157,8 +157,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) PERF_TEST_P_(DNNTestNetwork, DenseNet_121) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD))) + if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "", Mat(cv::Size(224, 224), CV_32FC3)); @@ -211,8 +210,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow) PERF_TEST_P_(DNNTestNetwork, YOLOv3) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)) + if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/dog416.png", false)); Mat inp; @@ -222,8 +220,11 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3) PERF_TEST_P_(DNNTestNetwork, EAST_text_detection) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)) + if (backend == DNN_BACKEND_HALIDE +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE < 2018030000 + || (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) +#endif + ) throw SkipTestException(""); processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3)); } diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 1282f59e8f..95643a287b 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -707,12 +707,6 @@ struct DataLayer : public Layer virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE - InferenceEngine::LayerParams lp; - lp.name = name; - lp.type = "ScaleShift"; - lp.precision = InferenceEngine::Precision::FP32; - std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); - CV_CheckEQ(inputsData.size(), (size_t)1, ""); CV_CheckEQ(inputsData[0].dims, 4, ""); const size_t numChannels = inputsData[0].size[1]; @@ -723,7 +717,6 @@ struct DataLayer : public Layer {numChannels}); 
weights->allocate(); weights->set(std::vector(numChannels, scaleFactors[0])); - ieLayer->_weights = weights; // Mean subtraction auto biases = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, @@ -735,8 +728,21 @@ struct DataLayer : public Layer biasesVec[i] = -means[0][i] * scaleFactors[0]; } biases->set(biasesVec); - ieLayer->_biases = biases; +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + ieLayer.setWeights(weights); + ieLayer.setBiases(biases); +#else + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "ScaleShift"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); + + ieLayer->_weights = weights; + ieLayer->_biases = biases; +#endif return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE return Ptr(); @@ -1480,7 +1486,11 @@ struct Net::Impl if (layerNet != ieInpNode->net) { // layerNet is empty or nodes are from different graphs. +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + ieInpNode->net->addOutput(ieInpNode->layer.getName()); +#else ieInpNode->net->addOutput(ieInpNode->layer->name); +#endif } } } @@ -1590,7 +1600,7 @@ struct Net::Impl // Build Inference Engine networks from sets of layers that support this // backend. Split a whole model on several Inference Engine networks if - // some of layers is not implemented. + // some of layers are not implemented. // Set of all input and output blobs wrappers for current network. std::map > netBlobsWrappers; @@ -1606,7 +1616,7 @@ struct Net::Impl { addInfEngineNetOutputs(ld); net = Ptr(); - netBlobsWrappers.clear(); + netBlobsWrappers.clear(); // Is not used for R5 release but we don't wrap it to #ifdef. layer->preferableTarget = DNN_TARGET_CPU; continue; } @@ -1624,12 +1634,13 @@ struct Net::Impl if (ieInpNode->net != net) { net = Ptr(); - netBlobsWrappers.clear(); + netBlobsWrappers.clear(); // Is not used for R5 release but we don't wrap it to #ifdef. break; } } } +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) // The same blobs wrappers cannot be shared between two Inference Engine // networks because of explicit references between layers and blobs. // So we need to rewrap all the external blobs. @@ -1646,6 +1657,7 @@ struct Net::Impl ld.inputBlobsWrappers[i] = it->second; } netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0]; +#endif // IE < R5 Ptr node; if (!net.empty()) @@ -1676,6 +1688,40 @@ struct Net::Impl CV_Assert(!ieNode.empty()); ieNode->net = net; + // Convert weights in FP16 for specific targets. +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || + preferableTarget == DNN_TARGET_MYRIAD || + preferableTarget == DNN_TARGET_FPGA) && !fused) + { + auto& blobs = ieNode->layer.getConstantData(); + if (blobs.empty()) + { + // In case of non weightable layer we have to specify + // it's precision adding dummy blob. 
+ auto blob = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP16, + InferenceEngine::Layout::C, {1}); + blob->allocate(); + blobs[""] = blob; + } + else + { + for (auto& it : blobs) + it.second = convertFp16(std::const_pointer_cast(it.second)); + } + } + + if (!fused) + net->addLayer(ieNode->layer); + + net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName()); + net->addBlobs(ld.inputBlobsWrappers); + net->addBlobs(ld.outputBlobsWrappers); + addInfEngineNetOutputs(ld); + +#else // IE >= R5 + auto weightableLayer = std::dynamic_pointer_cast(ieNode->layer); if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD || @@ -1713,10 +1759,10 @@ struct Net::Impl if (!fused) net->addLayer(ieNode->layer); addInfEngineNetOutputs(ld); +#endif // IE >= R5 } // Initialize all networks. - std::set initializedNets; for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it) { LayerData &ld = it->second; @@ -2622,7 +2668,11 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin) Net cvNet; cvNet.setInputsNames(inputsNames); +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + Ptr backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer(""))); +#else Ptr backendNode(new InfEngineBackendNode(0)); +#endif backendNode->net = Ptr(new InfEngineBackendNet(ieNet)); for (auto& it : ieNet.getOutputsInfo()) { diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 9a1707a3e8..522d0229ba 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -349,6 +349,14 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + + const size_t numChannels = weights_.total(); + ieLayer.setWeights(wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C)); + ieLayer.setBiases(wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ScaleShift"; @@ -360,6 +368,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 1eb149b3d1..9f8590bea7 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -110,6 +110,11 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::SplitLayer ieLayer(name); + ieLayer.setOutputPorts({InferenceEngine::Port()}); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); CV_Assert(!input->dims.empty()); @@ -123,6 +128,7 @@ public: ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]); #endif return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index bea2017729..19ab915ea6 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ 
b/modules/dnn/src/layers/concat_layer.cpp @@ -313,6 +313,14 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + + InferenceEngine::Builder::ConcatLayer ieLayer(name); + ieLayer.setAxis(clamp(axis, input->dims.size())); + ieLayer.setInputPorts(std::vector(inputs.size())); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; lp.name = name; @@ -321,6 +329,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::ConcatLayer(lp)); ieLayer->_axis = clamp(axis, input->dims.size()); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index fd31d9cc12..21a13c8d47 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -521,6 +521,54 @@ public: const int inpGroupCn = blobs[0].size[1]; const int group = inpCn / inpGroupCn; + auto ieWeights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); + if (newWeightAndBias) + { + if (weightsMat.isContinuous()) + { + Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); + ieWeights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW); + } + else + { + ieWeights = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, + ieWeights->dims()); + ieWeights->allocate(); + + Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn); + Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); + fusedWeights.copyTo(newWeights); + } + } + InferenceEngine::Blob::Ptr ieBiases; + if (hasBias() || fusedBias) + { + Mat biasesMat({outCn}, CV_32F, &biasvec[0]); + ieBiases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C); + } + +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ConvolutionLayer ieLayer(name); + + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setDilation({dilation.height, dilation.width}); + ieLayer.setPaddingsBegin({pad.height, pad.width}); + ieLayer.setPaddingsEnd({pad.height, pad.width}); + ieLayer.setGroup(group); + ieLayer.setOutDepth(outCn); + + ieLayer.setWeights(ieWeights); + if (ieBiases) + ieLayer.setBiases(ieBiases); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!padMode.empty()) + l.getParameters()["auto_pad"] = padMode == "VALID" ? 
std::string("valid") : std::string("same_upper"); + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Convolution"; @@ -557,32 +605,11 @@ public: ieLayer->_out_depth = outCn; ieLayer->_group = group; - ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); - if (newWeightAndBias) - { - if (weightsMat.isContinuous()) - { - Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); - ieLayer->_weights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW); - } - else - { - ieLayer->_weights = InferenceEngine::make_shared_blob( - InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, - ieLayer->_weights->dims()); - ieLayer->_weights->allocate(); - - Mat newWeights = infEngineBlobToMat(ieLayer->_weights).reshape(1, outCn); - Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); - fusedWeights.copyTo(newWeights); - } - } - if (hasBias() || fusedBias) - { - Mat biasesMat({outCn}, CV_32F, &biasvec[0]); - ieLayer->_biases = wrapToInfEngineBlob(biasesMat, {(size_t)outCn}, InferenceEngine::Layout::C); - } + ieLayer->_weights = ieWeights; + if (ieBiases) + ieLayer->_biases = ieBiases; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -1193,6 +1220,9 @@ public: #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { + if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width)) + return false; + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout const int group = numOutput / outGroupCn; if (group != 1) @@ -1747,6 +1777,27 @@ public: virtual Ptr initInfEngine(const std::vector > &) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout + const int group = numOutput / outGroupCn; + + InferenceEngine::Builder::DeconvolutionLayer ieLayer(name); + + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setDilation({dilation.height, dilation.width}); + ieLayer.setPaddingsBegin({pad.height, pad.width}); + ieLayer.setPaddingsEnd({pad.height, pad.width}); + ieLayer.setGroup(group); + ieLayer.setOutDepth(numOutput); + + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW)); + if (hasBias()) + { + ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C)); + } + return Ptr(new InfEngineBackendNode(ieLayer)); +#else const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout const int group = numOutput / outGroupCn; @@ -1786,6 +1837,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/crop_layer.cpp b/modules/dnn/src/layers/crop_layer.cpp index 32cdbbaa00..c7cd99c9aa 100644 --- a/modules/dnn/src/layers/crop_layer.cpp +++ b/modules/dnn/src/layers/crop_layer.cpp @@ -67,8 +67,12 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_INFERENCE_ENGINE && crop_ranges.size() == 4); +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) && crop_ranges.size() == 4; + else +#endif + return backendId == 
DNN_BACKEND_OPENCV; } bool getMemoryShapes(const std::vector &inputs, @@ -145,9 +149,10 @@ public: input(&crop_ranges[0]).copyTo(outputs[0]); } +#ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { -#ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Crop"; @@ -181,9 +186,11 @@ public: ieLayer->dim.push_back(crop_ranges[3].end - crop_ranges[3].start); #endif return Ptr(new InfEngineBackendNode(ieLayer)); -#endif // HAVE_INF_ENGINE +#else return Ptr(); +#endif // IE < R5 } +#endif std::vector crop_ranges; }; diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 2a21619d6c..cc87a120a8 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -939,6 +939,25 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::DetectionOutputLayer ieLayer(name); + + ieLayer.setNumClasses(_numClasses); + ieLayer.setShareLocation(_shareLocation); + ieLayer.setBackgroudLabelId(_backgroundLabelId); + ieLayer.setNMSThreshold(_nmsThreshold); + ieLayer.setTopK(_topK); + ieLayer.setKeepTopK(_keepTopK); + ieLayer.setConfidenceThreshold(_confidenceThreshold); + ieLayer.setVariantEncodedInTarget(_varianceEncodedInTarget); + ieLayer.setCodeType("caffe.PriorBoxParameter." + _codeType); + ieLayer.setInputPorts(std::vector(3)); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["eta"] = std::string("1.0"); + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "DetectionOutput"; @@ -956,6 +975,7 @@ public: ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0"; ieLayer->params["code_type"] = "caffe.PriorBoxParameter." 
+ _codeType; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 4ab8ed3a44..8fb596252f 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -153,10 +153,16 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer = func.initInfEngineBuilderAPI(); + ieLayer.setName(this->name); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = this->name; lp.precision = InferenceEngine::Precision::FP32; return Ptr(new InfEngineBackendNode(func.initInfEngine(lp))); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -355,6 +361,12 @@ struct ReLUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ReLULayer("").setNegativeSlope(slope); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ReLU"; @@ -363,6 +375,7 @@ struct ReLUFunctor ieLayer->params["negative_slope"] = format("%f", slope); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -472,6 +485,12 @@ struct ReLU6Functor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ClampLayer("").setMinValue(minValue).setMaxValue(maxValue); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "Clamp"; @@ -482,6 +501,7 @@ struct ReLU6Functor ieLayer->params["max"] = format("%f", maxValue); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -558,12 +578,19 @@ struct TanHFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::TanHLayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "TanH"; std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -640,12 +667,19 @@ struct SigmoidFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::SigmoidLayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "Sigmoid"; std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -724,11 +758,18 @@ struct ELUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ELULayer(""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ELU"; return InferenceEngine::CNNLayerPtr(new InferenceEngine::CNNLayer(lp)); } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -805,6 +846,12 @@ struct 
AbsValFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::ReLULayer("").setNegativeSlope(-1); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "ReLU"; @@ -813,6 +860,7 @@ struct AbsValFunctor ieLayer->params["negative_slope"] = "-1.0"; return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -868,11 +916,18 @@ struct BNLLFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + CV_Error(Error::StsNotImplemented, ""); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { CV_Error(Error::StsNotImplemented, "BNLL"); return InferenceEngine::CNNLayerPtr(); } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -985,6 +1040,14 @@ struct PowerFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + return InferenceEngine::Builder::PowerLayer("").setPower(power) + .setScale(scale) + .setShift(shift); + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { if (power == 1.0f && scale == 1.0f && shift == 0.0f) @@ -1004,6 +1067,7 @@ struct PowerFunctor return ieLayer; } } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN @@ -1143,6 +1207,15 @@ struct ChannelsPReLUFunctor #endif // HAVE_HALIDE #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer initInfEngineBuilderAPI() + { + InferenceEngine::Builder::PReLULayer ieLayer(""); + const size_t numChannels = scale.total(); + ieLayer.setWeights(wrapToInfEngineBlob(scale, {numChannels}, InferenceEngine::Layout::C)); + return ieLayer; + } +#else InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp) { lp.type = "PReLU"; @@ -1151,6 +1224,7 @@ struct ChannelsPReLUFunctor ieLayer->_weights = wrapToInfEngineBlob(scale, {numChannels}, InferenceEngine::Layout::C); return ieLayer; } +#endif #endif // HAVE_INF_ENGINE #ifdef HAVE_VULKAN diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index e0895b7f45..ed6da9e1a4 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -99,7 +99,7 @@ public: return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_INFERENCE_ENGINE && - (preferableTarget != DNN_TARGET_MYRIAD || coeffs.empty())); + (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())); } bool getMemoryShapes(const std::vector &inputs, @@ -420,9 +420,29 @@ public: return Ptr(); } - virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::EltwiseLayer ieLayer(name); + + ieLayer.setInputPorts(std::vector(inputs.size())); + + if (op == SUM) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM); + else if (op == PROD) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL); + else if (op == MAX) + ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX); + else + 
CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!coeffs.empty()) + l.getParameters()["coeff"] = coeffs; + + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Eltwise"; @@ -438,6 +458,7 @@ public: else CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index e3382f2d53..3a704dca81 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -152,9 +152,19 @@ public: } } - virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + ieLayer.setType("Flatten"); + ieLayer.getParameters()["axis"] = _startAxis; + ieLayer.getParameters()["end_axis"] = _endAxis; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Flatten"; @@ -163,6 +173,7 @@ public: ieLayer->params["axis"] = format("%d", _startAxis); ieLayer->params["end_axis"] = format("%d", _endAxis); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 78d3e809b5..3a71a872fe 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -442,6 +442,18 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::FullyConnectedLayer ieLayer(name); + + const int outNum = blobs[0].size[0]; + ieLayer.setOutputNum(outNum); + + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW)); + if (blobs.size() > 1) + ieLayer.setBiases(wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C)); + + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "FullyConnected"; @@ -456,6 +468,7 @@ public: if (blobs.size() > 1) ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)ieLayer->_out_num}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index b92610272b..fbd0c6ac59 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -393,6 +393,17 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::NormLayer ieLayer(name); + ieLayer.setSize(size); + ieLayer.setAlpha(alpha); + ieLayer.setBeta(beta); + ieLayer.setAcrossMaps(type == CHANNEL_NRM); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["k"] = bias; + return Ptr(new InfEngineBackendNode(l)); +#else InferenceEngine::LayerParams 
lp; lp.name = name; lp.type = "Norm"; @@ -405,6 +416,7 @@ public: ieLayer->_alpha = alpha; ieLayer->_isAcrossMaps = (type == CHANNEL_NRM); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 93dd5f05f6..772902ca01 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -371,6 +371,13 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::MVNLayer ieLayer(name); + ieLayer.setAcrossChannels(acrossChannels); + ieLayer.setNormalize(normVariance); + ieLayer.setEpsilon(eps); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "MVN"; @@ -380,6 +387,7 @@ public: ieLayer->params["normalize_variance"] = normVariance ? "1" : "0"; ieLayer->params["eps"] = format("%f", eps); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index b3ca64f24a..4766f1704e 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -264,6 +264,49 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + if (input->dims.size() == 4) + { + InferenceEngine::Builder::NormalizeLayer ieLayer(name); + + ieLayer.setChannelShared(false); + ieLayer.setAcrossMaps(acrossSpatial); + ieLayer.setEpsilon(epsilon); + + InferenceEngine::Builder::Layer l = ieLayer; + const int numChannels = input->dims[2]; // NOTE: input->dims are reversed (whcn) + if (blobs.empty()) + { + auto weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + InferenceEngine::Layout::C, + {(size_t)numChannels}); + weights->allocate(); + std::vector ones(numChannels, 1); + weights->set(ones); + l.addConstantData("weights", weights); + l.getParameters()["channel_shared"] = false; + } + else + { + CV_Assert(numChannels == blobs[0].total()); + l.addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)numChannels}, InferenceEngine::Layout::C)); + l.getParameters()["channel_shared"] = blobs[0].total() == 1; + } + l.getParameters()["across_spatial"] = acrossSpatial; + return Ptr(new InfEngineBackendNode(l)); + } + else + { + InferenceEngine::Builder::GRNLayer ieLayer(name); + ieLayer.setBeta(epsilon); + + InferenceEngine::Builder::Layer l = ieLayer; + l.getParameters()["bias"] = epsilon; + + return Ptr(new InfEngineBackendNode(l)); + } +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; @@ -307,6 +350,7 @@ public: ieLayer->params["bias"] = format("%f", epsilon); return Ptr(new InfEngineBackendNode(ieLayer)); } +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index ace567e182..bade509169 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -385,6 +385,11 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + 
InferenceEngine::Builder::PermuteLayer ieLayer(name); + ieLayer.setOrder(_order); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Permute"; @@ -397,6 +402,7 @@ public: ieLayer->params["order"] += format(",%zu", _order[i]); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 11fa7eaeab..bfcc1068e1 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -295,6 +295,48 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if (type == MAX || type == AVE) + { + InferenceEngine::Builder::PoolingLayer ieLayer(name); + ieLayer.setKernel({kernel.height, kernel.width}); + ieLayer.setStrides({stride.height, stride.width}); + ieLayer.setPaddingsBegin({pad_t, pad_l}); + ieLayer.setPaddingsEnd({pad_b, pad_r}); + ieLayer.setPoolingType(type == MAX ? + InferenceEngine::Builder::PoolingLayer::PoolingType::MAX : + InferenceEngine::Builder::PoolingLayer::PoolingType::AVG); + ieLayer.setRoundingType(ceilMode ? + InferenceEngine::Builder::PoolingLayer::RoundingType::CEIL : + InferenceEngine::Builder::PoolingLayer::RoundingType::FLOOR); + ieLayer.setExcludePad(type == AVE && padMode == "SAME"); + + InferenceEngine::Builder::Layer l = ieLayer; + if (!padMode.empty()) + l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper"); + return Ptr(new InfEngineBackendNode(l)); + } + else if (type == ROI) + { + InferenceEngine::Builder::ROIPoolingLayer ieLayer(name); + ieLayer.setSpatialScale(spatialScale); + ieLayer.setPooled({pooledSize.height, pooledSize.width}); + ieLayer.setInputPorts(std::vector(2)); + return Ptr(new InfEngineBackendNode(ieLayer)); + } + else if (type == PSROI) + { + InferenceEngine::Builder::PSROIPoolingLayer ieLayer(name); + ieLayer.setSpatialScale(spatialScale); + ieLayer.setOutputDim(psRoiOutChannels); + ieLayer.setGroupSize(pooledSize.width); + ieLayer.setInputPorts(std::vector(2)); + return Ptr(new InfEngineBackendNode(ieLayer)); + } + else + CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); + return Ptr(); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.precision = InferenceEngine::Precision::FP32; @@ -353,6 +395,7 @@ public: CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 93b39827d6..fb690d76ef 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -498,6 +498,58 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + if (_explicitSizes) + { + InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name); + + CV_Assert(_stepX == _stepY); + ieLayer.setStep(_stepX); + + CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); + ieLayer.setOffset(_offsetsX[0]); + + ieLayer.setClip(_clip); + ieLayer.setFlip(false); // We already flipped aspect ratios. 
+ + InferenceEngine::Builder::Layer l = ieLayer; + + CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty()); + CV_Assert(_boxWidths.size() == _boxHeights.size()); + l.getParameters()["width"] = _boxWidths; + l.getParameters()["height"] = _boxHeights; + l.getParameters()["variance"] = _variance; + return Ptr(new InfEngineBackendNode(l)); + } + else + { + InferenceEngine::Builder::PriorBoxLayer ieLayer(name); + + CV_Assert(!_explicitSizes); + + ieLayer.setMinSize(_minSize); + if (_maxSize > 0) + ieLayer.setMaxSize(_maxSize); + + CV_Assert(_stepX == _stepY); + ieLayer.setStep(_stepX); + + CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); + ieLayer.setOffset(_offsetsX[0]); + + ieLayer.setClip(_clip); + ieLayer.setFlip(false); // We already flipped aspect ratios. + + InferenceEngine::Builder::Layer l = ieLayer; + if (!_aspectRatios.empty()) + { + l.getParameters()["aspect_ratio"] = _aspectRatios; + } + CV_Assert(!_variance.empty()); + l.getParameters()["variance"] = _variance; + return Ptr(new InfEngineBackendNode(l)); + } +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = _explicitSizes ? "PriorBoxClustered" : "PriorBox"; @@ -553,6 +605,7 @@ public: ieLayer->params["offset"] = format("%f", _offsetsX[0]); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp index f559ee40e2..6514ed3a5c 100644 --- a/modules/dnn/src/layers/proposal_layer.cpp +++ b/modules/dnn/src/layers/proposal_layer.cpp @@ -328,6 +328,28 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ProposalLayer ieLayer(name); + + ieLayer.setBaseSize(baseSize); + ieLayer.setFeatStride(featStride); + ieLayer.setMinSize(16); + ieLayer.setNMSThresh(nmsThreshold); + ieLayer.setPostNMSTopN(keepTopAfterNMS); + ieLayer.setPreNMSTopN(keepTopBeforeNMS); + + std::vector scalesVec(scales.size()); + for (int i = 0; i < scales.size(); ++i) + scalesVec[i] = scales.get(i); + ieLayer.setScale(scalesVec); + + std::vector ratiosVec(ratios.size()); + for (int i = 0; i < ratios.size(); ++i) + ratiosVec[i] = ratios.get(i); + ieLayer.setRatio(ratiosVec); + + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Proposal"; @@ -353,6 +375,7 @@ public: ieLayer->params["scale"] += format(",%f", scales.get(i)); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index a98f690e65..3e42db5de1 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -181,6 +181,11 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ReorgYoloLayer ieLayer(name); + ieLayer.setStride(reorgStride); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ReorgYolo"; @@ -188,6 +193,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); ieLayer->params["stride"] = format("%d", reorgStride); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // 
HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 4109802a66..d6290456fa 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -203,6 +203,17 @@ public: return true; } + void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector outputs; + outputs_arr.getMatVector(outputs); + + CV_Assert(!outputs.empty()); + outShapes.resize(outputs.size()); + for (int i = 0; i < outputs.size(); ++i) + outShapes[i] = shape(outputs[i]); + } + bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { std::vector inputs; @@ -218,8 +229,7 @@ public: void *dst_handle = outputs[i].handle(ACCESS_WRITE); if (src_handle != dst_handle) { - MatShape outShape = shape(outputs[i]); - UMat umat = srcBlob.reshape(1, (int)outShape.size(), &outShape[0]); + UMat umat = srcBlob.reshape(1, (int)outShapes[i].size(), &outShapes[i][0]); umat.copyTo(outputs[i]); } } @@ -250,6 +260,12 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ReshapeLayer ieLayer(name); + CV_Assert(outShapes.size() == 1); + ieLayer.setDims(outShapes[0]); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Reshape"; @@ -265,9 +281,13 @@ public: ieLayer->shape = std::vector(shapeSrc->dims.rbegin(), shapeSrc->dims.rend()); } return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } + +private: + std::vector outShapes; }; Ptr ReshapeLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 6aa32150b6..03d806ad2c 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -163,6 +163,33 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + if (interpolation == "nearest") + { + ieLayer.setType("Resample"); + ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST"); + ieLayer.getParameters()["antialias"] = false; + if (scaleWidth != scaleHeight) + CV_Error(Error::StsNotImplemented, "resample with sw != sh"); + ieLayer.getParameters()["factor"] = 1.0 / scaleWidth; + } + else if (interpolation == "bilinear") + { + ieLayer.setType("Interp"); + ieLayer.getParameters()["pad_beg"] = 0; + ieLayer.getParameters()["pad_end"] = 0; + ieLayer.getParameters()["align_corners"] = false; + } + else + CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation); + ieLayer.getParameters()["width"] = outWidth; + ieLayer.getParameters()["height"] = outHeight; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.precision = InferenceEngine::Precision::FP32; @@ -187,6 +214,7 @@ public: ieLayer->params["width"] = cv::format("%d", outWidth); ieLayer->params["height"] = cv::format("%d", outHeight); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } @@ -247,6 +275,18 @@ public: virtual Ptr initInfEngine(const std::vector 
>&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + ieLayer.setType("Interp"); + ieLayer.getParameters()["pad_beg"] = 0; + ieLayer.getParameters()["pad_end"] = 0; + ieLayer.getParameters()["width"] = outWidth; + ieLayer.getParameters()["height"] = outHeight; + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Interp"; @@ -256,6 +296,7 @@ public: ieLayer->params["pad_beg"] = "0"; ieLayer->params["pad_end"] = "0"; return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index b217632584..a11fd379a2 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -197,6 +197,29 @@ public: virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::ScaleShiftLayer ieLayer(name); + + CV_Assert(!blobs.empty()); + const size_t numChannels = blobs[0].total(); + if (hasWeights) + { + ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C)); + } + else + { + auto weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + {numChannels}); + weights->allocate(); + + std::vector ones(numChannels, 1); + weights->set(ones); + ieLayer.setWeights(weights); + } + if (hasBias) + ieLayer.setBiases(wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "ScaleShift"; @@ -223,6 +246,7 @@ public: ieLayer->_biases = wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 66f9aea440..0821979376 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -110,8 +110,15 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1 && sliceRanges[0].size() == 4); +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + { + return INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) && + sliceRanges.size() == 1 && sliceRanges[0].size() == 4; + } + else +#endif + return backendId == DNN_BACKEND_OPENCV; } bool getMemoryShapes(const std::vector &inputs, @@ -254,9 +261,10 @@ public: } } +#ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { -#ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; lp.name = name; @@ -286,10 +294,11 @@ public: ieLayer->dim.push_back(sliceRanges[0][i].end - sliceRanges[0][i].start); } return Ptr(new InfEngineBackendNode(ieLayer)); - -#endif // HAVE_INF_ENGINE +#else return Ptr(); +#endif // IE < R5 } +#endif }; Ptr SliceLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/softmax_layer.cpp 
b/modules/dnn/src/layers/softmax_layer.cpp index ab4fd6d7ce..cdd91059ed 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -326,6 +326,13 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); + + InferenceEngine::Builder::SoftMaxLayer ieLayer(name); + ieLayer.setAxis(clamp(axisRaw, input->dims.size())); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); InferenceEngine::LayerParams lp; @@ -335,6 +342,7 @@ public: std::shared_ptr ieLayer(new InferenceEngine::SoftMaxLayer(lp)); ieLayer->axis = clamp(axisRaw, input->dims.size()); return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE return Ptr(); } diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 7beec8a1a0..98de907b9e 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -18,6 +18,10 @@ namespace cv { namespace dnn { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) +InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::Builder::Layer& _layer) + : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {} +#else InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& _layer) : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {} @@ -40,6 +44,7 @@ void InfEngineBackendNode::connect(std::vector >& inputs, layer->outData[0] = dataPtr; dataPtr->creatorLayer = InferenceEngine::CNNLayerWeakPtr(layer); } +#endif static std::vector > infEngineWrappers(const std::vector >& ptrs) @@ -54,6 +59,129 @@ infEngineWrappers(const std::vector >& ptrs) return wrappers; } +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + +InfEngineBackendNet::InfEngineBackendNet() : netBuilder("") +{ + hasNetOwner = false; + targetDevice = InferenceEngine::TargetDevice::eCPU; +} + +InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net) +{ + hasNetOwner = true; + targetDevice = InferenceEngine::TargetDevice::eCPU; +} + +void InfEngineBackendNet::connect(const std::vector >& inputs, + const std::vector >& outputs, + const std::string& layerName) +{ + std::vector > inpWrappers = infEngineWrappers(inputs); + std::map::iterator it = layers.find(layerName); + CV_Assert(it != layers.end()); + + const int layerId = it->second; + for (int i = 0; i < inpWrappers.size(); ++i) + { + const auto& inp = inpWrappers[i]; + const std::string& inpName = inp->dataPtr->name; + int inpId; + it = layers.find(inpName); + if (it == layers.end()) + { + InferenceEngine::Builder::InputLayer inpLayer(inpName); + + std::vector shape(inp->blob->dims()); + std::reverse(shape.begin(), shape.end()); + + inpLayer.setPort(InferenceEngine::Port(shape)); + inpId = netBuilder.addLayer(inpLayer); + + layers.insert({inpName, inpId}); + } + else + inpId = it->second; + + netBuilder.connect(inpId, {layerId, i}); + unconnectedLayersIds.erase(inpId); + } + CV_Assert(!outputs.empty()); + InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]); + dataPtr->name = layerName; +} + +void InfEngineBackendNet::init(int targetId) +{ + if (!hasNetOwner) + { + CV_Assert(!unconnectedLayersIds.empty()); + for (int id : unconnectedLayersIds) + { + InferenceEngine::Builder::OutputLayer outLayer("myconv1"); + 
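The builder path above assembles the network incrementally: every layer added through addLayer() is remembered in a name-to-id map and provisionally treated as a network output, connect() resolves (or lazily creates) its inputs and drops them from the unconnected set, and init() finally attaches an output layer to every id that is still unconnected before converting the builder into a CNNNetwork. A minimal sketch of just that bookkeeping, using plain standard containers and an invented GraphBuilder type (not the Inference Engine API), assuming a single-output toy graph:

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the layers / unconnectedLayersIds bookkeeping above.
    struct GraphBuilder
    {
        std::map<std::string, int> layers;   // layer name -> id
        std::set<int> unconnected;           // ids that nothing consumes yet
        int nextId = 0;

        int addLayer(const std::string& name)
        {
            int id = nextId++;
            layers[name] = id;
            unconnected.insert(id);          // assume it is an output until something consumes it
            return id;
        }

        void connect(const std::string& src, const std::string& dst)
        {
            int srcId = layers.count(src) ? layers[src] : addLayer(src); // lazily create unknown inputs
            (void)dst;
            unconnected.erase(srcId);        // src now feeds another layer, so it is not an output
        }

        std::vector<int> finish() const      // like init(): remaining ids get output layers attached
        {
            return std::vector<int>(unconnected.begin(), unconnected.end());
        }
    };

    int main()
    {
        GraphBuilder g;
        g.addLayer("conv1");
        g.addLayer("relu1");
        g.connect("conv1", "relu1");
        for (int id : g.finish())
            std::cout << "still unconnected (network output) id: " << id << '\n'; // relu1 only
    }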
netBuilder.addLayer({id}, outLayer); + } + cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build())); + } + + switch (targetId) + { + case DNN_TARGET_CPU: + targetDevice = InferenceEngine::TargetDevice::eCPU; + break; + case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16: + targetDevice = InferenceEngine::TargetDevice::eGPU; + break; + case DNN_TARGET_MYRIAD: + targetDevice = InferenceEngine::TargetDevice::eMYRIAD; + break; + case DNN_TARGET_FPGA: + targetDevice = InferenceEngine::TargetDevice::eFPGA; + break; + default: + CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId)); + } + + for (const auto& name : requestedOutputs) + { + cnn.addOutput(name); + } + + for (const auto& it : cnn.getInputsInfo()) + { + const std::string& name = it.first; + auto blobIt = allBlobs.find(name); + CV_Assert(blobIt != allBlobs.end()); + inpBlobs[name] = blobIt->second; + it.second->setPrecision(blobIt->second->precision()); + } + for (const auto& it : cnn.getOutputsInfo()) + { + const std::string& name = it.first; + auto blobIt = allBlobs.find(name); + CV_Assert(blobIt != allBlobs.end()); + outBlobs[name] = blobIt->second; + it.second->setPrecision(blobIt->second->precision()); // Should be always FP32 + } + + initPlugin(cnn); +} + +void InfEngineBackendNet::addLayer(const InferenceEngine::Builder::Layer& layer) +{ + int id = netBuilder.addLayer(layer); + const std::string& layerName = layer.getName(); + CV_Assert(layers.insert({layerName, id}).second); + unconnectedLayersIds.insert(id); +} + +void InfEngineBackendNet::addOutput(const std::string& name) +{ + requestedOutputs.push_back(name); +} + +#endif // IE >= R5 + static InferenceEngine::Layout estimateLayout(const Mat& m) { if (m.dims == 4) @@ -148,6 +276,7 @@ void InfEngineBackendWrapper::setHostDirty() } +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) InfEngineBackendNet::InfEngineBackendNet() { targetDevice = InferenceEngine::TargetDevice::eCPU; @@ -491,6 +620,8 @@ void InfEngineBackendNet::init(int targetId) initPlugin(*this); } +#endif // IE < R5 + static std::map sharedPlugins; void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) @@ -566,7 +697,11 @@ void InfEngineBackendNet::addBlobs(const std::vector >& ptrs auto wrappers = infEngineWrappers(ptrs); for (const auto& wrapper : wrappers) { - allBlobs.insert({wrapper->dataPtr->name, wrapper->blob}); + std::string name = wrapper->dataPtr->name; +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + name = name.empty() ? "id1" : name; // TODO: drop the magic input name. 
+#endif + allBlobs.insert({name, wrapper->blob}); } } diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 118e525d97..a224767f8d 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -35,6 +35,11 @@ #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000)) #define INF_ENGINE_VER_MAJOR_GE(ver) (((INF_ENGINE_RELEASE) / 10000) >= ((ver) / 10000)) +#define INF_ENGINE_VER_MAJOR_LT(ver) (((INF_ENGINE_RELEASE) / 10000) < ((ver) / 10000)) + +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) +#include +#endif #endif // HAVE_INF_ENGINE @@ -42,6 +47,7 @@ namespace cv { namespace dnn { #ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2018R5) class InfEngineBackendNet : public InferenceEngine::ICNNNetwork { public: @@ -146,17 +152,75 @@ private: void initPlugin(InferenceEngine::ICNNNetwork& net); }; +#else // IE < R5 + +class InfEngineBackendNet +{ +public: + InfEngineBackendNet(); + + InfEngineBackendNet(InferenceEngine::CNNNetwork& net); + + void addLayer(const InferenceEngine::Builder::Layer& layer); + + void addOutput(const std::string& name); + + void connect(const std::vector >& inputs, + const std::vector >& outputs, + const std::string& layerName); + + bool isInitialized(); + + void init(int targetId); + + void forward(); + + void initPlugin(InferenceEngine::ICNNNetwork& net); + + void addBlobs(const std::vector >& ptrs); + +private: + InferenceEngine::Builder::Network netBuilder; + + InferenceEngine::InferenceEnginePluginPtr enginePtr; + InferenceEngine::InferencePlugin plugin; + InferenceEngine::ExecutableNetwork netExec; + InferenceEngine::InferRequest infRequest; + InferenceEngine::BlobMap allBlobs; + InferenceEngine::BlobMap inpBlobs; + InferenceEngine::BlobMap outBlobs; + InferenceEngine::TargetDevice targetDevice; + + InferenceEngine::CNNNetwork cnn; + bool hasNetOwner; + + std::map layers; + std::vector requestedOutputs; + + std::set unconnectedLayersIds; +}; +#endif // IE < R5 + class InfEngineBackendNode : public BackendNode { public: +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InfEngineBackendNode(const InferenceEngine::Builder::Layer& layer); +#else InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& layer); +#endif void connect(std::vector >& inputs, std::vector >& outputs); - InferenceEngine::CNNLayerPtr layer; // Inference Engine network object that allows to obtain the outputs of this layer. 
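The INF_ENGINE_VER_MAJOR_* macros used throughout these hunks compare only the release part of INF_ENGINE_RELEASE by integer-dividing it by 10000, so 2018R5 (2018050000) becomes 201805 and 2018R4 (2018040000) becomes 201804; the lower build digits never affect the comparison. A tiny self-contained check of that arithmetic, with the macros reproduced here purely for illustration outside their real header:

    #include <cassert>

    // Same arithmetic as the macros in op_inf_engine.hpp, reproduced for illustration.
    #define RELEASE_2018R4 2018040000
    #define RELEASE_2018R5 2018050000
    #define VER_MAJOR_GE(cur, ver) (((cur) / 10000) >= ((ver) / 10000))
    #define VER_MAJOR_LT(cur, ver) (((cur) / 10000) <  ((ver) / 10000))

    int main()
    {
        static_assert(RELEASE_2018R5 / 10000 == 201805, "only the release digits survive");
        assert(VER_MAJOR_GE(RELEASE_2018R5, RELEASE_2018R5));  // R5 builds take the Builder path
        assert(VER_MAJOR_LT(RELEASE_2018R4, RELEASE_2018R5));  // R4 builds keep the CNNLayer path
        assert(!VER_MAJOR_LT(2018050123, RELEASE_2018R5));     // a hypothetical R5 patch build still counts as R5
        return 0;
    }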
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer layer; Ptr net; +#else + InferenceEngine::CNNLayerPtr layer; + Ptr net; +#endif }; class InfEngineBackendWrapper : public BackendWrapper diff --git a/modules/dnn/src/torch/THGeneral.cpp b/modules/dnn/src/torch/THGeneral.cpp index 8a52745770..0c27edc6fb 100644 --- a/modules/dnn/src/torch/THGeneral.cpp +++ b/modules/dnn/src/torch/THGeneral.cpp @@ -1,10 +1,2 @@ #include "../precomp.hpp" - -#if defined(TH_DISABLE_HEAP_TRACKING) -#elif (defined(__unix) || defined(_WIN32)) -#include -#elif defined(__APPLE__) -#include -#endif - #include "THGeneral.h" diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 3a64d6485b..eef4f6ba79 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -180,7 +180,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); - float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 0.0; + float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 2e-5; float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0; processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt", inp, "detection_out", "", l1, lInf, 0.25); @@ -288,7 +288,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false); // Output image has values in range [-143.526, 148.539]. float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.3 : 4e-5; - float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.0 : 2e-3; + float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
7.28 : 2e-3; processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf); } diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 5d41b4b916..d7c14f2714 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -306,7 +306,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc) // batch size 1 testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_MYRIAD) #endif // batch size 2 diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index 468953fe7e..9cbfb0c402 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -163,7 +163,7 @@ TEST_P(Deconvolution, Accuracy) bool hasBias = get<6>(GetParam()); Backend backendId = get<0>(get<7>(GetParam())); Target targetId = get<1>(get<7>(GetParam())); - if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU && + if (backendId == DNN_BACKEND_INFERENCE_ENGINE && (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_MYRIAD) && dilation.width == 2 && dilation.height == 2) throw SkipTestException(""); #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 @@ -466,6 +466,7 @@ void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId) pool.set("stride_w", 2); pool.set("stride_h", 2); pool.type = "Pooling"; + pool.name = "ave_pool"; Net net; int poolId = net.addLayer(pool.name, pool.type, pool); diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 4ccefd28a9..62e625f03c 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -295,10 +295,6 @@ TEST_P(Test_Caffe_layers, Eltwise) { if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) - throw SkipTestException("Test is disabled for OpenVINO 2018R5"); -#endif testLayerUsingCaffeModels("layer_eltwise"); } diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index deccbfb0eb..acdd66631c 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -351,6 +351,10 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR) l1 = 0.009; lInf = 0.035; } + else if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU) { + l1 = 4.5e-5; + lInf = 1.9e-4; + } testONNXModels("LResNet100E_IR", pb, l1, lInf); } @@ -366,6 +370,10 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus) l1 = 0.021; lInf = 0.034; } + else if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_CPU || target == DNN_TARGET_OPENCL)) { + l1 = 2.4e-4; + lInf = 6e-4; + } testONNXModels("emotion_ferplus", pb, l1, lInf); } @@ -389,7 +397,7 @@ TEST_P(Test_ONNX_nets, Inception_v1) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); + throw SkipTestException("Test is disabled for OpenVINO 2018R5"); #endif testONNXModels("inception_v1", pb); } diff --git a/modules/dnn/test/test_tf_importer.cpp 
b/modules/dnn/test/test_tf_importer.cpp index ce4997cd4e..b20b166551 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -40,7 +40,7 @@ TEST(Test_TensorFlow, read_inception) ASSERT_TRUE(!sample.empty()); Mat input; resize(sample, input, Size(224, 224)); - input -= 128; // mean sub + input -= Scalar::all(117); // mean sub Mat inputBlob = blobFromImage(input); @@ -351,8 +351,8 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD) Mat out = net.forward(); Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); - float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1e-5; - float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0098 : 1e-3; + float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1.5e-5; + float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 1e-3; normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff); } @@ -366,6 +366,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); + double scoresDiff = backend == DNN_BACKEND_INFERENCE_ENGINE ? 2.9e-5 : 1e-5; for (int i = 0; i < 2; ++i) { std::string proto = findDataFile("dnn/" + names[i] + ".pbtxt", false); @@ -381,7 +382,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) Mat out = net.forward(); Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/" + names[i] + ".detection_out.npy")); - normAssertDetections(ref, out, names[i].c_str(), 0.3); + normAssertDetections(ref, out, names[i].c_str(), 0.3, scoresDiff); } } @@ -406,7 +407,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN) net.setInput(blob); Mat out = net.forward(); - double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : default_l1; + double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 1.1e-5; double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.021 : default_lInf; normAssertDetections(ref, out, "", 0.4, scoreDiff, iouDiff); } @@ -568,10 +569,6 @@ TEST_P(Test_TensorFlow_layers, slice) if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); -#endif runTensorFlowNet("slice_4d"); } diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index c63cf26e45..046bd65b86 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -260,6 +260,11 @@ TEST_P(Test_Torch_layers, run_paralel) TEST_P(Test_Torch_layers, net_residual) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || + target == DNN_TARGET_OPENCL_FP16)) + throw SkipTestException("Test is disabled for OpenVINO 2018R5"); +#endif runTorchNet("net_residual", "", false, true); } @@ -390,10 +395,6 @@ TEST_P(Test_Torch_nets, ENet_accuracy) // -model models/instance_norm/feathers.t7 TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException(""); -#endif checkBackend(); std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7", "dnn/fast_neural_style_instance_norm_feathers.t7"}; diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp index 9076c23545..f1e8b63799 100644 --- a/modules/features2d/src/blobdetector.cpp +++ b/modules/features2d/src/blobdetector.cpp @@ -197,8 +197,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag centers.clear(); std::vector < std::vector > contours; - Mat tmpBinaryImage = binaryImage.clone(); - findContours(tmpBinaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE); + findContours(binaryImage, contours, RETR_LIST, CHAIN_APPROX_NONE); #ifdef DEBUG_BLOB_DETECTOR // Mat keypointsImage; @@ -214,7 +213,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag { Center center; center.confidence = 1; - Moments moms = moments(Mat(contours[contourIdx])); + Moments moms = moments(contours[contourIdx]); if (params.filterByArea) { double area = moms.m00; @@ -225,7 +224,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag if (params.filterByCircularity) { double area = moms.m00; - double perimeter = arcLength(Mat(contours[contourIdx]), true); + double perimeter = arcLength(contours[contourIdx], true); double ratio = 4 * CV_PI * area / (perimeter * perimeter); if (ratio < params.minCircularity || ratio >= params.maxCircularity) continue; @@ -261,9 +260,9 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImag if (params.filterByConvexity) { std::vector < Point > hull; - convexHull(Mat(contours[contourIdx]), hull); - double area = contourArea(Mat(contours[contourIdx])); - double hullArea = contourArea(Mat(hull)); + convexHull(contours[contourIdx], hull); + double area = contourArea(contours[contourIdx]); + double hullArea = contourArea(hull); if (fabs(hullArea) < DBL_EPSILON) continue; double ratio = area / hullArea; diff --git a/modules/imgproc/perf/perf_contours.cpp 
b/modules/imgproc/perf/perf_contours.cpp index d3a70cfdd7..bc8b530016 100644 --- a/modules/imgproc/perf/perf_contours.cpp +++ b/modules/imgproc/perf/perf_contours.cpp @@ -84,4 +84,26 @@ PERF_TEST_P(TestFindContoursFF, findContours, SANITY_CHECK_NOTHING(); } +typedef TestBaseWithParam< tuple > TestBoundingRect; + +PERF_TEST_P(TestBoundingRect, BoundingRect, + Combine( + testing::Values(CV_32S, CV_32F), // points type + Values(400, 511, 1000, 10000, 100000) // points count + ) +) + +{ + int ptType = get<0>(GetParam()); + int n = get<1>(GetParam()); + + Mat pts(n, 2, ptType); + declare.in(pts, WARMUP_RNG); + + cv::Rect rect; + TEST_CYCLE() rect = boundingRect(pts); + + SANITY_CHECK_NOTHING(); +} + } } // namespace diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp index 4b2ba97148..d64c49e0a9 100644 --- a/modules/imgproc/perf/perf_integral.cpp +++ b/modules/imgproc/perf/perf_integral.cpp @@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam Size_MatType_OutMatD PERF_TEST_P(Size_MatType_OutMatDepth, integral, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4), testing::Values(CV_32S, CV_32F, CV_64F) ) ) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 550fdffdb9..538f158b6e 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -213,7 +213,7 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs } // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16); + bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); dx1 = std::max(anchor.x - roi.x, 0); dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp index 0878dc456f..a1a75a29e1 100644 --- a/modules/imgproc/src/fixedpoint.inl.hpp +++ b/modules/imgproc/src/fixedpoint.inl.hpp @@ -11,16 +11,6 @@ #include "opencv2/core/softfloat.hpp" -#ifndef CV_ALWAYS_INLINE - #if defined(__GNUC__) && (__GNUC__ > 3 ||(__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) - #define CV_ALWAYS_INLINE inline __attribute__((always_inline)) - #elif defined(_MSC_VER) - #define CV_ALWAYS_INLINE __forceinline - #else - #define CV_ALWAYS_INLINE inline - #endif -#endif - namespace { diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp index 52dc239bc6..5690553b70 100644 --- a/modules/imgproc/src/morph.cpp +++ b/modules/imgproc/src/morph.cpp @@ -45,6 +45,7 @@ #include "opencl_kernels_imgproc.hpp" #include #include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" #include /****************************************************************************************\ @@ -97,73 +98,65 @@ struct MorphNoVec int operator()(uchar**, int, uchar*, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD -template struct MorphRowIVec +template struct MorphRowVec { - enum { ESZ = VecUpdate::ESZ }; - - MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar* src, uchar* dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - cn *= ESZ; int i, k, 
_ksize = ksize*cn; - width = (width & -4)*cn; + width *= cn; VecUpdate updateOp; - for( i = 0; i <= width - 16; i += 16 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) { - __m128i s = _mm_loadu_si128((const __m128i*)(src + i)); - for( k = cn; k < _ksize; k += cn ) + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); + vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); + vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); + for (k = cn; k < _ksize; k += cn) { - __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k)); - s = updateOp(s, x); + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); + s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); } - _mm_storeu_si128((__m128i*)(dst + i), s); + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + v_store((stype*)dst + i + 2*vtype::nlanes, s2); + v_store((stype*)dst + i + 3*vtype::nlanes, s3); } - - for( ; i < width; i += 4 ) + if( i <= width - 2*vtype::nlanes ) { - __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i)); + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); for( k = cn; k < _ksize; k += cn ) { - __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k)); - s = updateOp(s, x); + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); } - *(int*)(dst + i) = _mm_cvtsi128_si32(s); + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; } - - return i/ESZ; - } - - int ksize, anchor; -}; - - -template struct MorphRowFVec -{ - MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar* src, uchar* dst, int width, int cn) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int i, k, _ksize = ksize*cn; - width = (width & -4)*cn; - VecUpdate updateOp; - - for( i = 0; i < width; i += 4 ) + if( i <= width - vtype::nlanes ) { - __m128 s = _mm_loadu_ps((const float*)src + i); + vtype s = vx_load((const stype*)src + i); for( k = cn; k < _ksize; k += cn ) - { - __m128 x = _mm_loadu_ps((const float*)src + i + k); - s = updateOp(s, x); - } - _mm_storeu_ps((float*)dst + i, s); + s = updateOp(s, vx_load((const stype*)src + i + k)); + v_store((stype*)dst + i, s); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s = vx_load_low((const stype*)src + i); + for( k = cn; k < _ksize; k += cn ) + s = updateOp(s, vx_load_low((const stype*)src + i + k)); + v_store_low((stype*)dst + i, s); + i += vtype::nlanes/2; } return i; @@ -173,230 +166,156 @@ template struct MorphRowFVec }; -template struct MorphColumnIVec +template struct MorphColumnVec { - enum { ESZ = VecUpdate::ESZ }; - - MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int i = 0, k, _ksize = ksize; - width *= ESZ; VecUpdate updateOp; 
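The vectorized row and column filters above all compute the same thing: a running minimum (erosion) or maximum (dilation) over ksize samples taken every cn elements, so each channel of interleaved data is filtered independently. A scalar reference of the row pass, written as a standalone sketch of the operation the universal-intrinsics code accelerates (it does not reproduce OpenCV's border handling):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Scalar reference of the morphology row filter: for every output position i,
    // take the min (erode) over src[i], src[i+cn], ..., src[i+(ksize-1)*cn].
    static void erodeRowScalar(const std::vector<unsigned char>& src,
                               std::vector<unsigned char>& dst,
                               int width, int cn, int ksize)
    {
        dst.assign(width * cn, 0);
        for (int i = 0; i < width * cn; ++i)
        {
            unsigned char m = src[i];
            for (int k = cn; k < ksize * cn; k += cn)   // same-channel neighbours only
                m = std::min(m, src[i + k]);
            dst[i] = m;
        }
    }

    int main()
    {
        // Single-channel toy row, padded so that i + (ksize-1)*cn stays in range.
        std::vector<unsigned char> src = {9, 3, 7, 5, 8, 2, 6, 4, 4, 4};
        std::vector<unsigned char> dst;
        erodeRowScalar(src, dst, /*width=*/8, /*cn=*/1, /*ksize=*/3);
        for (int i = 0; i < 8; ++i)
            printf("%d ", dst[i]);   // prints: 3 3 5 2 2 2 4 4
        printf("\n");
    }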
for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)src[i] & 15) == 0 ); + CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); + + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; + dststep /= sizeof(dst[0]); for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) { - for( i = 0; i <= width - 32; i += 32 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) { - const uchar* sptr = src[1] + i; - __m128i s0 = _mm_load_si128((const __m128i*)sptr); - __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); for( k = 2; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); } sptr = src[0] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0)); - _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1)); + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0)); - _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1)); + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); } - - for( ; i <= width - 8; i += 8 ) + if( i <= width - 2*vtype::nlanes ) { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0; + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); for( k = 2; k < _ksize; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); - } - - x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)); - _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0)); - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0)); - } - } - - for( ; count > 0; count--, dst += dststep, src++ ) - { - for( i = 0; i <= width - 32; i += 32 ) - { - const uchar* sptr = src[0] + i; - __m128i s0 = _mm_load_si128((const __m128i*)sptr); - __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; - - for( k = 1; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_si128((const __m128i*)sptr); - x1 = _mm_load_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); 
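A pattern worth noting in the rewritten kernels is the tail handling: instead of one wide SIMD loop plus a scalar remainder, the code processes blocks of 4*nlanes, then optionally one block of 2*nlanes, one of nlanes, and one half register, which keeps the remainder vectorized for any CV_SIMD width. A stripped-down sketch of that structure, assuming the OpenCV universal intrinsics (vx_load, v_min, v_store, v_float32::nlanes) behave as they are used elsewhere in this patch, with a plain scalar remainder instead of the half-register step:

    #include <algorithm>
    #include <cstdio>
    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>   // universal intrinsics, as included by morph.cpp in this patch

    // Elementwise minimum of two float arrays, structured like the kernels above:
    // wide 4*nlanes blocks first, then progressively narrower vector tails, then scalar.
    static void minArrays(const float* a, const float* b, float* dst, int n)
    {
        int i = 0;
    #if CV_SIMD
        const int L = cv::v_float32::nlanes;
        for (; i <= n - 4*L; i += 4*L)
            for (int j = 0; j < 4*L; j += L)
                cv::v_store(dst + i + j, cv::v_min(cv::vx_load(a + i + j), cv::vx_load(b + i + j)));
        if (i <= n - 2*L)
        {
            cv::v_store(dst + i,     cv::v_min(cv::vx_load(a + i),     cv::vx_load(b + i)));
            cv::v_store(dst + i + L, cv::v_min(cv::vx_load(a + i + L), cv::vx_load(b + i + L)));
            i += 2*L;
        }
        if (i <= n - L)
        {
            cv::v_store(dst + i, cv::v_min(cv::vx_load(a + i), cv::vx_load(b + i)));
            i += L;
        }
    #endif
        for (; i < n; ++i)                    // scalar remainder (the real kernels also use half registers)
            dst[i] = std::min(a[i], b[i]);
    }

    int main()
    {
        std::vector<float> a(37, 2.f), b(37, 1.f), dst(37);
        minArrays(a.data(), b.data(), dst.data(), (int)a.size());
        printf("%g %g\n", dst[0], dst[36]);   // prints: 1 1
    }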
+ s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); } - _mm_storeu_si128((__m128i*)(dst + i), s0); - _mm_storeu_si128((__m128i*)(dst + i + 16), s1); - } - for( ; i <= width - 8; i += 8 ) - { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0; + sptr = src[0] + i; + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - for( k = 1; k < _ksize; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); - } - _mm_storel_epi64((__m128i*)(dst + i), s0); + sptr = src[k] + i; + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + i += 2*vtype::nlanes; } - } - - return i/ESZ; - } - - int ksize, anchor; -}; - - -template struct MorphColumnFVec -{ - MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int i = 0, k, _ksize = ksize; - VecUpdate updateOp; - - for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)_src[i] & 15) == 0 ); - - const float** src = (const float**)_src; - float* dst = (float*)_dst; - dststep /= sizeof(dst[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) - { - for( i = 0; i <= width - 16; i += 16 ) + if( i <= width - vtype::nlanes ) { - const float* sptr = src[1] + i; - __m128 s0 = _mm_load_ps(sptr); - __m128 s1 = _mm_load_ps(sptr + 4); - __m128 s2 = _mm_load_ps(sptr + 8); - __m128 s3 = _mm_load_ps(sptr + 12); - __m128 x0, x1, x2, x3; + vtype s0 = vx_load_aligned(src[1] + i); for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); - } - - sptr = src[0] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - _mm_storeu_ps(dst + i, updateOp(s0, x0)); - _mm_storeu_ps(dst + i + 4, updateOp(s1, x1)); - _mm_storeu_ps(dst + i + 8, updateOp(s2, x2)); - _mm_storeu_ps(dst + i + 12, updateOp(s3, x3)); + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0)); - _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1)); - _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2)); - _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3)); + v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); + i += vtype::nlanes; } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - vtype::nlanes/2 ) { - __m128 s0 = _mm_load_ps(src[1] + i), x0; + vtype s0 = vx_load_low(src[1] + i); for( k = 2; k < _ksize; k++ ) - { - x0 = _mm_load_ps(src[k] + i); - s0 = updateOp(s0, x0); - } + s0 = updateOp(s0, vx_load_low(src[k] + i)); - x0 = _mm_load_ps(src[0] + i); - _mm_storeu_ps(dst + i, updateOp(s0, x0)); - x0 = _mm_load_ps(src[k] + i); - _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0)); + v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); + v_store_low(dst + 
dststep + i, updateOp(s0, vx_load_low(src[k] + i))); + i += vtype::nlanes/2; } } for( ; count > 0; count--, dst += dststep, src++ ) { - for( i = 0; i <= width - 16; i += 16 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) { - const float* sptr = src[0] + i; - __m128 s0 = _mm_load_ps(sptr); - __m128 s1 = _mm_load_ps(sptr + 4); - __m128 s2 = _mm_load_ps(sptr + 8); - __m128 s3 = _mm_load_ps(sptr + 12); - __m128 x0, x1, x2, x3; + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); for( k = 1; k < _ksize; k++ ) { sptr = src[k] + i; - x0 = _mm_load_ps(sptr); - x1 = _mm_load_ps(sptr + 4); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - x2 = _mm_load_ps(sptr + 8); - x3 = _mm_load_ps(sptr + 12); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); } - - for( i = 0; i <= width - 4; i += 4 ) + if( i <= width - 2*vtype::nlanes ) { - __m128 s0 = _mm_load_ps(src[0] + i), x0; + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + for( k = 1; k < _ksize; k++ ) { - x0 = _mm_load_ps(src[k] + i); - s0 = updateOp(s0, x0); + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s0 = vx_load_aligned(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s0 = vx_load_low(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; } } @@ -407,185 +326,109 @@ template struct MorphColumnFVec }; -template struct MorphIVec +template struct MorphVec { - enum { ESZ = VecUpdate::ESZ }; - - int operator()(uchar** src, int nz, uchar* dst, int width) const + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + int operator()(uchar** _src, int nz, uchar* _dst, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; int i, k; - width *= ESZ; VecUpdate updateOp; - for( i = 0; i <= width - 32; i += 32 ) + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) { - const uchar* sptr = src[0] + i; - __m128i s0 = _mm_loadu_si128((const __m128i*)sptr); - __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16)); - __m128i x0, x1; - + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); + vtype s2 = vx_load(sptr + 2*vtype::nlanes); + vtype s3 = vx_load(sptr + 3*vtype::nlanes); for( k = 1; k < nz; k++ ) { sptr = src[k] + i; - x0 = 
_mm_loadu_si128((const __m128i*)sptr); - x1 = _mm_loadu_si128((const __m128i*)(sptr + 16)); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - } - _mm_storeu_si128((__m128i*)(dst + i), s0); - _mm_storeu_si128((__m128i*)(dst + i + 16), s1); - } - - for( ; i <= width - 8; i += 8 ) - { - __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0; - - for( k = 1; k < nz; k++ ) - { - x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i)); - s0 = updateOp(s0, x0); + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); } - _mm_storel_epi64((__m128i*)(dst + i), s0); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); } - - return i/ESZ; - } -}; - - -template struct MorphFVec -{ - int operator()(uchar** _src, int nz, uchar* _dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - float* dst = (float*)_dst; - int i, k; - VecUpdate updateOp; - - for( i = 0; i <= width - 16; i += 16 ) + if( i <= width - 2*vtype::nlanes ) { - const float* sptr = src[0] + i; - __m128 s0 = _mm_loadu_ps(sptr); - __m128 s1 = _mm_loadu_ps(sptr + 4); - __m128 s2 = _mm_loadu_ps(sptr + 8); - __m128 s3 = _mm_loadu_ps(sptr + 12); - __m128 x0, x1, x2, x3; - + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); for( k = 1; k < nz; k++ ) { sptr = src[k] + i; - x0 = _mm_loadu_ps(sptr); - x1 = _mm_loadu_ps(sptr + 4); - x2 = _mm_loadu_ps(sptr + 8); - x3 = _mm_loadu_ps(sptr + 12); - s0 = updateOp(s0, x0); - s1 = updateOp(s1, x1); - s2 = updateOp(s2, x2); - s3 = updateOp(s3, x3); + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); } - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - vtype::nlanes ) { - __m128 s0 = _mm_loadu_ps(src[0] + i), x0; - + vtype s0 = vx_load(src[0] + i); for( k = 1; k < nz; k++ ) - { - x0 = _mm_loadu_ps(src[k] + i); - s0 = updateOp(s0, x0); - } - _mm_storeu_ps(dst + i, s0); + s0 = updateOp(s0, vx_load(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; } - - for( ; i < width; i++ ) + if( i <= width - vtype::nlanes/2 ) { - __m128 s0 = _mm_load_ss(src[0] + i), x0; - + vtype s0 = vx_load_low(src[0] + i); for( k = 1; k < nz; k++ ) - { - x0 = _mm_load_ss(src[k] + i); - s0 = updateOp(s0, x0); - } - _mm_store_ss(dst + i, s0); + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; } - return i; } }; -struct VMin8u -{ - enum { ESZ = 1 }; - __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); } -}; -struct VMax8u -{ - enum { ESZ = 1 }; - __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); } -}; -struct VMin16u -{ - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); } -}; -struct VMax16u -{ - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_adds_epu16(_mm_subs_epu16(a,b), b); } -}; -struct VMin16s +template struct VMin { - enum { ESZ = 2 }; - __m128i 
operator()(const __m128i& a, const __m128i& b) const - { return _mm_min_epi16(a, b); } + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } }; -struct VMax16s +template struct VMax { - enum { ESZ = 2 }; - __m128i operator()(const __m128i& a, const __m128i& b) const - { return _mm_max_epi16(a, b); } + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } }; -struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }}; -struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }}; - -typedef MorphRowIVec ErodeRowVec8u; -typedef MorphRowIVec DilateRowVec8u; -typedef MorphRowIVec ErodeRowVec16u; -typedef MorphRowIVec DilateRowVec16u; -typedef MorphRowIVec ErodeRowVec16s; -typedef MorphRowIVec DilateRowVec16s; -typedef MorphRowFVec ErodeRowVec32f; -typedef MorphRowFVec DilateRowVec32f; - -typedef MorphColumnIVec ErodeColumnVec8u; -typedef MorphColumnIVec DilateColumnVec8u; -typedef MorphColumnIVec ErodeColumnVec16u; -typedef MorphColumnIVec DilateColumnVec16u; -typedef MorphColumnIVec ErodeColumnVec16s; -typedef MorphColumnIVec DilateColumnVec16s; -typedef MorphColumnFVec ErodeColumnVec32f; -typedef MorphColumnFVec DilateColumnVec32f; - -typedef MorphIVec ErodeVec8u; -typedef MorphIVec DilateVec8u; -typedef MorphIVec ErodeVec16u; -typedef MorphIVec DilateVec16u; -typedef MorphIVec ErodeVec16s; -typedef MorphIVec DilateVec16s; -typedef MorphFVec ErodeVec32f; -typedef MorphFVec DilateVec32f; + +typedef MorphRowVec > ErodeRowVec8u; +typedef MorphRowVec > DilateRowVec8u; +typedef MorphRowVec > ErodeRowVec16u; +typedef MorphRowVec > DilateRowVec16u; +typedef MorphRowVec > ErodeRowVec16s; +typedef MorphRowVec > DilateRowVec16s; +typedef MorphRowVec > ErodeRowVec32f; +typedef MorphRowVec > DilateRowVec32f; + +typedef MorphColumnVec > ErodeColumnVec8u; +typedef MorphColumnVec > DilateColumnVec8u; +typedef MorphColumnVec > ErodeColumnVec16u; +typedef MorphColumnVec > DilateColumnVec16u; +typedef MorphColumnVec > ErodeColumnVec16s; +typedef MorphColumnVec > DilateColumnVec16s; +typedef MorphColumnVec > ErodeColumnVec32f; +typedef MorphColumnVec > DilateColumnVec32f; + +typedef MorphVec > ErodeVec8u; +typedef MorphVec > DilateVec8u; +typedef MorphVec > ErodeVec16u; +typedef MorphVec > DilateVec16u; +typedef MorphVec > ErodeVec16s; +typedef MorphVec > DilateVec16s; +typedef MorphVec > ErodeVec32f; +typedef MorphVec > DilateVec32f; #else diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index d505fde4fc..436c74eade 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -39,6 +39,8 @@ // //M*/ #include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" + namespace cv { @@ -746,109 +748,161 @@ static Rect pointSetBoundingRect( const Mat& points ) if( npoints == 0 ) return Rect(); - const Point* pts = points.ptr(); - Point pt = pts[0]; +#if CV_SIMD + const int64_t* pts = points.ptr(); -#if CV_SSE4_2 - if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) + if( !is_float ) { - if( !is_float ) + v_int32 minval, maxval; + minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 ) { - __m128i minval, maxval; - minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y - - for( i = 1; i < npoints; i++ ) - { - __m128i ptXY = 
_mm_loadl_epi64((const __m128i*)&pts[i]); - minval = _mm_min_epi32(ptXY, minval); - maxval = _mm_max_epi32(ptXY, maxval); - } - xmin = _mm_cvtsi128_si32(minval); - ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); - xmax = _mm_cvtsi128_si32(maxval); - ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); + v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); } - else + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_int32::nlanes/4 ) { - __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); - minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); - - for( i = 1; i < npoints; i++ ) + v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_int64::nlanes/2; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = minval.get0(); + xmax = maxval.get0(); + ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0(); + ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0(); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_int32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); - - minvalf = _mm_min_ps(minvalf, ptXY); - maxvalf = _mm_max_ps(maxvalf, ptXY); + v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); } - - float xyminf[2], xymaxf[2]; - _mm_storel_pi((__m64*)xyminf, minvalf); - _mm_storel_pi((__m64*)xymaxf, maxvalf); - xmin = cvFloor(xyminf[0]); - ymin = cvFloor(xyminf[1]); - xmax = cvFloor(xymaxf[0]); - ymax = cvFloor(xymaxf[1]); + xmin = min(xmin, minval2.get0()); + xmax = max(xmax, maxval2.get0()); + ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()); + ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()); } +#endif } else -#endif { - if( !is_float ) + v_float32 minval, maxval; + minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 ) { - xmin = xmax = pt.x; - ymin = ymax = pt.y; - - for( i = 1; i < npoints; i++ ) + v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); + } + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_float32::nlanes/4 ) + { + 
v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_float32::nlanes/4; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = cvFloor(minval.get0()); + xmax = cvFloor(maxval.get0()); + ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0()); + ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0()); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_float32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - pt = pts[i]; + v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); + } + xmin = min(xmin, cvFloor(minval2.get0())); + xmax = max(xmax, cvFloor(maxval2.get0())); + ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0())); + ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0())); + } +#endif + } +#else + const Point* pts = points.ptr(); + Point pt = pts[0]; - if( xmin > pt.x ) - xmin = pt.x; + if( !is_float ) + { + xmin = xmax = pt.x; + ymin = ymax = pt.y; - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } - } - else - { - Cv32suf v; - // init values - xmin = xmax = CV_TOGGLE_FLT(pt.x); - ymin = ymax = CV_TOGGLE_FLT(pt.y); + if( xmax < pt.x ) + xmax = pt.x; - for( i = 1; i < npoints; i++ ) - { - pt = pts[i]; - pt.x = CV_TOGGLE_FLT(pt.x); - pt.y = CV_TOGGLE_FLT(pt.y); + if( ymin > pt.y ) + ymin = pt.y; - if( xmin > pt.x ) - xmin = pt.x; + if( ymax < pt.y ) + ymax = pt.y; + } + } + else + { + Cv32suf v; + // init values + xmin = xmax = CV_TOGGLE_FLT(pt.x); + ymin = ymax = CV_TOGGLE_FLT(pt.y); - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; + pt.x = CV_TOGGLE_FLT(pt.x); + pt.y = CV_TOGGLE_FLT(pt.y); - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } + if( xmax < pt.x ) + xmax = pt.x; + + if( ymin > pt.y ) + ymin = pt.y; - v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); - // because right and bottom sides of the bounding rectangle are not inclusive - // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil - v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); + if( ymax < pt.y ) + ymax = pt.y; } + + v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); + // because right and bottom sides of the bounding rectangle are not inclusive + // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil + v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } +#endif return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); } diff 
--git a/modules/imgproc/src/sumpixels.avx512_skx.cpp b/modules/imgproc/src/sumpixels.avx512_skx.cpp new file mode 100644 index 0000000000..7e5cbdcf88 --- /dev/null +++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp @@ -0,0 +1,262 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2019, Intel Corporation, all rights reserved. +#include "precomp.hpp" +#include "sumpixels.hpp" + +namespace cv { +namespace { // Anonymous namespace to avoid exposing the implementation classes + +// +// NOTE: Look at the bottom of the file for the entry-point function for external callers +// + +// At the moment only 3 channel support untilted is supported +// More channel support coming soon. +// TODO: Add support for sqsum and 1,2, and 4 channels +class IntegralCalculator_3Channel { +public: + IntegralCalculator_3Channel() {}; + + + void calculate_integral_avx512(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn) + { + const int srcstep = (int)(_srcstep/sizeof(uchar)); + const int sumstep = (int)(_sumstep/sizeof(double)); + const int sqsumstep = (int)(_sqsumstep/sizeof(double)); + const int ops_per_line = width * cn; + + // Clear the first line of the sum as per spec (see integral documentation) + // Also adjust the index of sum and sqsum to be at the real 0th element + // and not point to the border pixel so it stays in sync with the src pointer + memset( sum, 0, (ops_per_line+cn)*sizeof(double)); + sum += cn; + + if (sqsum) { + memset( sqsum, 0, (ops_per_line+cn)*sizeof(double)); + sqsum += cn; + } + + // Now calculate the integral over the whole image one line at a time + for(int y = 0; y < height; y++) { + const uchar * src_line = &src[y*srcstep]; + double * sum_above = &sum[y*sumstep]; + double * sum_line = &sum_above[sumstep]; + double * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep] : NULL; + double * sqsum_line = (sqsum) ? &sqsum_above[sqsumstep] : NULL; + + integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line); + + } + } + + static inline + void integral_line_3channel_avx512(const uchar *srcs, + double *sums, double *sums_above, + double *sqsums, double *sqsums_above, + int num_ops_in_line) + { + __m512i sum_accumulator = _mm512_setzero_si512(); // holds rolling sums for the line + __m512i sqsum_accumulator = _mm512_setzero_si512(); // holds rolling sqsums for the line + + // The first element on each line must be zeroes as per spec (see integral documentation) + set_border_pixel_value(sums, sqsums); + + // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk + aligned_integral( srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); + post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); + + } + + + static inline + void set_border_pixel_value(double *sums, double *sqsums) + { + // Sets the border pixel value to 0s. 
+ // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now + __m512i zeroes = _mm512_setzero_si512(); + + _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes); + if (sqsums) + _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes); + } + + + static inline + void aligned_integral(const uchar *&srcs, + double *&sums, double *&sums_above, + double *&sqsum, double *&sqsum_above, + __m512i &sum_accumulator, __m512i &sqsum_accumulator, + int num_ops_in_line) + { + // This function handles full 64 byte chunks of the source data at a time until it gets to the part of + // the line that no longer contains a full 64 byte chunk. Other code will handle the last part. + + const int num_chunks = num_ops_in_line >> 6; // quick int divide by 64 + + for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){ + integral_64_operations_avx512((__m512i *) srcs, + (__m512i *) sums, (__m512i *) sums_above, + (__m512i *) sqsum, (__m512i *) sqsum_above, + 0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator); + srcs+=64; sums+=64; sums_above+=64; + if (sqsum){ sqsum+= 64; sqsum_above+=64; } + } + } + + + static inline + void post_aligned_integral(const uchar *srcs, + const double *sums, const double *sums_above, + const double *sqsum, const double *sqsum_above, + __m512i &sum_accumulator, __m512i &sqsum_accumulator, + int num_ops_in_line) + { + // This function handles the last few straggling operations that are not a full chunk of 64 operations + // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64). + + const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F; // Quick int modulo 64 + + if (num_operations > 0) { + __mmask64 operation_mask = (1ULL << num_operations) - 1ULL; + + integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above, + (__m512i *) sqsum, (__m512i *) sqsum_above, + operation_mask, sum_accumulator, sqsum_accumulator); + } + } + + + static inline + void integral_64_operations_avx512(const __m512i *srcs, + __m512i *sums, const __m512i *sums_above, + __m512i *sqsums, const __m512i *sqsums_above, + __mmask64 data_mask, + __m512i &sum_accumulator, __m512i &sqsum_accumulator) + { + __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask); + + for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) { + __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); // Get lower 16 bytes of data + + for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) { + + __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes); + + // Calculate integral for the sum on the 8 entries + integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator); + sums++; sums_above++; + + if (sqsums){ // Calculate integral for the sum on the 8 entries + __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs); + + integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator); + sqsums++; sqsums_above++; + } + + // Prepare for next iteration of inner loop + // shift source to align next 8 bytes to lane 0 and shift the mask + src_16bytes = shift_right_8_bytes(src_16bytes); + data_mask = data_mask >> 8; + + } + + // Prepare for next iteration of outer loop + src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk); + } + } + + + static inline + void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask, + __m512i *results_ptr, __m512i 
&accumulator) + { + _mm512_mask_storeu_pd( + results_ptr, // Store the result here + data_mask, // Using the data mask to avoid overrunning the line + calculate_integral( // Writing the value of the integral derived from: + src_longs, // input data + _mm512_maskz_loadu_pd(data_mask, above_values_ptr), // and the results from line above + accumulator // keeping track of the accumulator + ) + ); + } + + + static inline + __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) + { + __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5); + + // Align data to prepare for the adds: + // shifts data left by 3 and 6 qwords (lanes) and gets rolling sum in all lanes + // Vertical LANES: 76543210 + // src_longs : HGFEDCBA + // shifted3lanes : + EDCBA + // shifted6lanes : + BA + // carry_over_idxs : + 65765765 (index position of result from previous iteration) + // = integral + __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs); + __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); + __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); + + // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator) + __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes); + __m512i sum_src_carry = _mm512_add_epi64(src_longs, carry_over); + accumulator = _mm512_add_epi64(sum_shift3and6, sum_src_carry); + + // Convert to packed double and add to the line above to get the true integral value + __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); + __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); + return integral_pd; + } + + + static inline + __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask) { + return _mm512_maskz_loadu_epi8(data_mask, srcs); + } + + + static inline + __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes) { + return _mm512_cvtepu8_epi64(src_16bytes); + } + + + static inline + __m128i shift_right_8_bytes(__m128i src_16bytes) { + return _mm_maskz_compress_epi64(2, src_16bytes); + } + + + static inline + __m512i shift_right_16_bytes(__m512i src_64byte_chunk) { + return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk); + } + +}; +} // end of anonymous namespace + +namespace opt_AVX512_SKX { + +// This is the implementation for the external callers' interface entry point. +// It should be the only function called into this file from outside +// Any new implementations should be directed from here +void calculate_integral_avx512(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn) +{ + IntegralCalculator_3Channel calculator; + calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn); +} + + +} // end namespace opt_AVX512_SKX +} // end namespace cv diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index 3c49aaf773..ae7647b8bd 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -10,7 +10,7 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners.
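For context, the new AVX-512 kernel above is only reached through cv::integral() for an untilted integral of a 3-channel 8-bit image with double-precision sum/sqsum, and only when the CPU and build actually provide AVX-512 (CV_TRY_AVX512_SKX / CV_CPU_HAS_SUPPORT_AVX512_SKX). A minimal sketch of that accelerated case from Python, cross-checked against a plain NumPy reduction (image size and pixel values here are arbitrary, not taken from the patch):

import numpy as np
import cv2 as cv

# Arbitrary 3-channel 8-bit input: the case targeted by the new kernel
# (untilted integral, cn == 3, double-precision accumulators).
img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

# Request CV_64F for both sum and sqsum so the double-precision path is
# eligible; dispatch to the AVX-512 code happens inside cv::integral.
sum_img, sqsum_img = cv.integral2(img, sdepth=cv.CV_64F, sqdepth=cv.CV_64F)

# integral(y, x) is the per-channel sum over img[:y, :x]; spot-check one point.
y, x = 200, 300
ref = img[:y, :x].astype(np.float64).sum(axis=(0, 1))
assert np.allclose(sum_img[y, x], ref)
assert np.allclose(sqsum_img[y, x], (img[:y, :x].astype(np.float64) ** 2).sum(axis=(0, 1)))

The result is identical whether or not the AVX-512 path is taken; the specialization only changes how fast the sums are produced.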
@@ -44,6 +44,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "opencv2/core/hal/intrin.hpp" +#include "sumpixels.hpp" namespace cv @@ -62,6 +63,37 @@ struct Integral_SIMD } }; + +template <> +struct Integral_SIMD { + Integral_SIMD() {}; + + + bool operator()(const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + double *tilted, size_t _tiltedstep, + int width, int height, int cn) const + { +#if CV_TRY_AVX512_SKX + CV_UNUSED(_tiltedstep); + // TODO: Add support for 1,2, and 4 channels + if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){ + opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep, + sqsum, _sqsumstep, width, height, cn); + return true; + } +#else + // Avoid warnings in some builds + CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep); + CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep); + CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn); +#endif + return false; + } + +}; + #if CV_SIMD && CV_SIMD_WIDTH <= 64 template <> diff --git a/modules/imgproc/src/sumpixels.hpp b/modules/imgproc/src/sumpixels.hpp new file mode 100644 index 0000000000..8d5ab0a851 --- /dev/null +++ b/modules/imgproc/src/sumpixels.hpp @@ -0,0 +1,25 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2019, Intel Corporation, all rights reserved. +#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP +#define OPENCV_IMGPROC_SUM_PIXELS_HPP + +namespace cv +{ + +namespace opt_AVX512_SKX +{ +#if CV_TRY_AVX512_SKX + void calculate_integral_avx512( + const uchar *src, size_t _srcstep, + double *sum, size_t _sumstep, + double *sqsum, size_t _sqsumstep, + int width, int height, int cn); + +#endif +} // end namespace opt_AVX512_SKX +} // end namespace cv + +#endif diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index 7c5bb163f6..157a83b603 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -190,82 +190,78 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) int j = 0; const uchar* src = _src.ptr(); uchar* dst = _dst.ptr(); -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - v_uint8x16 thresh_u = v_setall_u8( thresh ); - v_uint8x16 maxval16 = v_setall_u8( maxval ); +#if CV_SIMD + v_uint8 thresh_u = vx_setall_u8( thresh ); + v_uint8 maxval16 = vx_setall_u8( maxval ); - switch( type ) + switch( type ) + { + case THRESH_BINARY: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - case THRESH_BINARY: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = thresh_u < v0; - v0 = v0 & maxval16; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = thresh_u < v0; + v0 = v0 & maxval16; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_BINARY_INV: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_BINARY_INV: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= 
roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = v0 <= thresh_u; + v0 = v0 & maxval16; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TRUNC: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TRUNC: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = v0 - ( v0 - thresh_u ); - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = v0 - ( v0 - thresh_u ); + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TOZERO: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TOZERO: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = ( thresh_u < v0 ) & v0; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = ( thresh_u < v0 ) & v0; + v_store( dst + j, v0 ); } - break; + } + break; - case THRESH_TOZERO_INV: - for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + case THRESH_TOZERO_INV: + for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) { - for( j = 0; j <= roi.width - 16; j += 16 ) - { - v_uint8x16 v0; - v0 = v_load( src + j ); - v0 = ( v0 <= thresh_u ) & v0; - v_store( dst + j, v0 ); - } + v_uint8 v0; + v0 = vx_load( src + j ); + v0 = ( v0 <= thresh_u ) & v0; + v_store( dst + j, v0 ); } - break; } + break; } #endif @@ -355,125 +351,156 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) const ushort* src = _src.ptr(); ushort* dst = _dst.ptr(); -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); - if (useSIMD) - { - int i, j; - v_uint16x8 thresh_u = v_setall_u16(thresh); - v_uint16x8 maxval16 = v_setall_u16(maxval); +#if CV_SIMD + int i, j; + v_uint16 thresh_u = vx_setall_u16(thresh); + v_uint16 maxval16 = vx_setall_u16(maxval); - switch (type) + switch (type) + { + case THRESH_BINARY: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { - case THRESH_BINARY: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) { - for (j = 0; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = thresh_u < v0; - v1 = thresh_u < v1; - v0 = v0 & maxval16; - v1 = v1 & maxval16; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } - - for (; j < roi.width; j++) - dst[j] = threshBinary(src[j], thresh, maxval); + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = thresh_u < v0; + v1 = thresh_u < v1; + v0 = v0 & maxval16; + v1 = v1 & maxval16; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; - - case THRESH_BINARY_INV: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + if (j <= roi.width - v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = 
v_load(src + j); - v1 = v_load(src + j + 8); - v0 = v0 <= thresh_u; - v1 = v1 <= thresh_u; - v0 = v0 & maxval16; - v1 = v1 & maxval16; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } - - for (; j < roi.width; j++) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + v_uint16 v0 = vx_load(src + j); + v0 = thresh_u < v0; + v0 = v0 & maxval16; + v_store(dst + j, v0); + j += v_uint16::nlanes; } - break; - case THRESH_TRUNC: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) - { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = v_min(v0, thresh_u); - v1 = v_min(v1, thresh_u); - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + for (; j < roi.width; j++) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; - for (; j < roi.width; j++) - dst[j] = threshTrunc(src[j], thresh); + case THRESH_BINARY_INV: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = v0 <= thresh_u; + v1 = v1 <= thresh_u; + v0 = v0 & maxval16; + v1 = v1 & maxval16; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; - - case THRESH_TOZERO: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + if (j <= roi.width - v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = (thresh_u < v0) & v0; - v1 = (thresh_u < v1) & v1; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + v_uint16 v0 = vx_load(src + j); + v0 = v0 <= thresh_u; + v0 = v0 & maxval16; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } - for (; j < roi.width; j++) - dst[j] = threshToZero(src[j], thresh); + for (; j < roi.width; j++) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = v_min(v0, thresh_u); + v1 = v_min(v1, thresh_u); + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); + } + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = v_min(v0, thresh_u); + v_store(dst + j, v0); + j += v_uint16::nlanes; } - break; - case THRESH_TOZERO_INV: - for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + for (; j < roi.width; j++) + dst[j] = threshTrunc(src[j], thresh); + } + break; + + case THRESH_TOZERO: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) { - j = 0; - for (; j <= roi.width - 16; j += 16) - { - v_uint16x8 v0, v1; - v0 = v_load(src + j); - v1 = v_load(src + j + 8); - v0 = (v0 <= thresh_u) & v0; - v1 = (v1 <= thresh_u) & v1; - v_store(dst + j, v0); - v_store(dst + j + 8, v1); - } + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = (thresh_u < v0) & v0; + v1 = (thresh_u < v1) & v1; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); + } + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = (thresh_u < v0) & v0; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } - 
for (; j < roi.width; j++) - dst[j] = threshToZeroInv(src[j], thresh); + for (; j < roi.width; j++) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) + { + j = 0; + for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + { + v_uint16 v0, v1; + v0 = vx_load(src + j); + v1 = vx_load(src + j + v_uint16::nlanes); + v0 = (v0 <= thresh_u) & v0; + v1 = (v1 <= thresh_u) & v1; + v_store(dst + j, v0); + v_store(dst + j + v_uint16::nlanes, v1); } - break; + if (j <= roi.width - v_uint16::nlanes) + { + v_uint16 v0 = vx_load(src + j); + v0 = (v0 <= thresh_u) & v0; + v_store(dst + j, v0); + j += v_uint16::nlanes; + } + + for (; j < roi.width; j++) + dst[j] = threshToZeroInv(src[j], thresh); } + break; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } } static void @@ -544,128 +571,159 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) } #endif -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - int i, j; - v_int16x8 thresh8 = v_setall_s16( thresh ); - v_int16x8 maxval8 = v_setall_s16( maxval ); +#if CV_SIMD + int i, j; + v_int16 thresh8 = vx_setall_s16( thresh ); + v_int16 maxval8 = vx_setall_s16( maxval ); - switch( type ) + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - case THRESH_BINARY: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = thresh8 < v0; - v1 = thresh8 < v1; - v0 = v0 & maxval8; - v1 = v1 & maxval8; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = thresh8 < v0; + v1 = thresh8 < v1; + v0 = v0 & maxval8; + v1 = v1 & maxval8; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - - case THRESH_BINARY_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + if( j <= roi.width - v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = v0 <= thresh8; - v1 = v1 <= thresh8; - v0 = v0 & maxval8; - v1 = v1 & maxval8; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + v_int16 v0 = vx_load( src + j ); + v0 = thresh8 < v0; + v0 = v0 & maxval8; + v_store( dst + j, v0 ); + j += v_int16::nlanes; } - break; - case THRESH_TRUNC: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = v_min( v0, thresh8 ); - v1 = v_min( v1, thresh8 ); - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + for( ; j < roi.width; j++ ) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; - for( ; j < roi.width; j++ ) - dst[j] = threshTrunc( src[j], thresh ); + case THRESH_BINARY_INV: + 
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = v0 <= thresh8; + v1 = v1 <= thresh8; + v0 = v0 & maxval8; + v1 = v1 & maxval8; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - - case THRESH_TOZERO: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + if( j <= roi.width - v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = ( thresh8 < v0 ) & v0; - v1 = ( thresh8 < v1 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + v_int16 v0 = vx_load( src + j ); + v0 = v0 <= thresh8; + v0 = v0 & maxval8; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } - for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); + for( ; j < roi.width; j++ ) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = v_min( v0, thresh8 ); + v1 = v_min( v1, thresh8 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); + } + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh8 ); + v_store( dst + j, v0 ); + j += v_int16::nlanes; } - break; - case THRESH_TOZERO_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + for( ; j < roi.width; j++ ) + dst[j] = threshTrunc( src[j], thresh ); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) { - j = 0; - for( ; j <= roi.width - 16; j += 16 ) - { - v_int16x8 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 8 ); - v0 = ( v0 <= thresh8 ) & v0; - v1 = ( v1 <= thresh8 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 8, v1 ); - } + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = ( thresh8 < v0 ) & v0; + v1 = ( thresh8 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); + } + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = ( thresh8 < v0 ) & v0; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } - for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); + for( ; j < roi.width; j++ ) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + { + v_int16 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_int16::nlanes ); + v0 = ( v0 <= thresh8 ) & v0; + v1 = ( v1 <= thresh8 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_int16::nlanes, v1 ); } - break; - default: - CV_Error( CV_StsBadArg, "" ); return; + if( j <= roi.width - v_int16::nlanes ) + { + v_int16 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh8 ) & v0; + v_store( dst + j, v0 ); + j += v_int16::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZeroInv(src[j], thresh); } + break; + 
default: + CV_Error( CV_StsBadArg, "" ); return; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } } @@ -719,175 +777,40 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) } #endif -#if CV_SIMD128 - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) - { - int i, j; - v_float32x4 thresh4 = v_setall_f32( thresh ); - v_float32x4 maxval4 = v_setall_f32( maxval ); - - switch( type ) - { - case THRESH_BINARY: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = thresh4 < v0; - v1 = thresh4 < v1; - v0 = v0 & maxval4; - v1 = v1 & maxval4; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); - } - break; - - case THRESH_BINARY_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = v0 <= thresh4; - v1 = v1 <= thresh4; - v0 = v0 & maxval4; - v1 = v1 & maxval4; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); - } - break; - - case THRESH_TRUNC: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = v_min( v0, thresh4 ); - v1 = v_min( v1, thresh4 ); - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshTrunc(src[j], thresh); - } - break; - - case THRESH_TOZERO: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = ( thresh4 < v0 ) & v0; - v1 = ( thresh4 < v1 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); - } - break; - - case THRESH_TOZERO_INV: - for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) - { - j = 0; - for( ; j <= roi.width - 8; j += 8 ) - { - v_float32x4 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 4 ); - v0 = ( v0 <= thresh4 ) & v0; - v1 = ( v1 <= thresh4 ) & v1; - v_store( dst + j, v0 ); - v_store( dst + j + 4, v1 ); - } - - for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); - } - break; - default: - CV_Error( CV_StsBadArg, "" ); return; - } - } - else -#endif - { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); - } -} - -static void -thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) -{ - Size roi = _src.size(); - roi.width *= _src.channels(); - const double* src = _src.ptr(); - double* dst = _dst.ptr(); - size_t src_step = _src.step / sizeof(src[0]); - size_t dst_step = _dst.step / sizeof(dst[0]); - - if (_src.isContinuous() && _dst.isContinuous()) - { - roi.width *= roi.height; - roi.height = 1; - } +#if CV_SIMD + int i, j; + v_float32 thresh4 = vx_setall_f32( thresh ); + v_float32 maxval4 = vx_setall_f32( maxval ); -#if 
CV_SIMD128_64F - bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON ); - if( useSIMD ) + switch( type ) { - int i, j; - v_float64x2 thresh2 = v_setall_f64( thresh ); - v_float64x2 maxval2 = v_setall_f64( maxval ); - - switch( type ) - { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = thresh2 < v0; - v1 = thresh2 < v1; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = thresh4 < v0; + v1 = thresh4 < v1; + v0 = v0 & maxval4; + v1 = v1 & maxval4; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) + { + v_float32 v0 = vx_load( src + j ); + v0 = thresh4 < v0; + v0 = v0 & maxval4; + v_store( dst + j, v0 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshBinary(src[j], thresh, maxval); + dst[j] = threshBinary(src[j], thresh, maxval); } break; @@ -895,21 +818,29 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = v0 <= thresh2; - v1 = v1 <= thresh2; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = v0 <= thresh4; + v1 = v1 <= thresh4; + v0 = v0 & maxval4; + v1 = v1 & maxval4; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) + { + v_float32 v0 = vx_load( src + j ); + v0 = v0 <= thresh4; + v0 = v0 & maxval4; + v_store( dst + j, v0 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshBinaryInv(src[j], thresh, maxval); + dst[j] = threshBinaryInv(src[j], thresh, maxval); } break; @@ -917,19 +848,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = v_min( v0, thresh4 ); + v1 = v_min( v1, thresh4 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = v_min( v0, thresh2 ); - v1 = v_min( v1, thresh2 ); + v_float32 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh4 ); v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshTrunc(src[j], thresh); + dst[j] = threshTrunc(src[j], thresh); } break; @@ -937,19 +875,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 
2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = ( thresh4 < v0 ) & v0; + v1 = ( thresh4 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = ( thresh2 < v0 ) & v0; - v1 = ( thresh2 < v1 ) & v1; + v_float32 v0 = vx_load( src + j ); + v0 = ( thresh4 < v0 ) & v0; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshToZero(src[j], thresh); + dst[j] = threshToZero(src[j], thresh); } break; @@ -957,30 +902,205 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 4; j += 4 ) + for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + { + v_float32 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float32::nlanes ); + v0 = ( v0 <= thresh4 ) & v0; + v1 = ( v1 <= thresh4 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float32::nlanes, v1 ); + } + if( j <= roi.width - v_float32::nlanes ) { - v_float64x2 v0, v1; - v0 = v_load( src + j ); - v1 = v_load( src + j + 2 ); - v0 = ( v0 <= thresh2 ) & v0; - v1 = ( v1 <= thresh2 ) & v1; + v_float32 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh4 ) & v0; v_store( dst + j, v0 ); - v_store( dst + j + 2, v1 ); + j += v_float32::nlanes; } for( ; j < roi.width; j++ ) - dst[j] = threshToZeroInv(src[j], thresh); + dst[j] = threshToZeroInv(src[j], thresh); } break; default: - CV_Error(CV_StsBadArg, ""); return; - } + CV_Error( CV_StsBadArg, "" ); return; } - else +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); #endif +} + +static void +thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) +{ + Size roi = _src.size(); + roi.width *= _src.channels(); + const double* src = _src.ptr(); + double* dst = _dst.ptr(); + size_t src_step = _src.step / sizeof(src[0]); + size_t dst_step = _dst.step / sizeof(dst[0]); + + if (_src.isContinuous() && _dst.isContinuous()) { - threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); + roi.width *= roi.height; + roi.height = 1; } + +#if CV_SIMD_64F + int i, j; + v_float64 thresh2 = vx_setall_f64( thresh ); + v_float64 maxval2 = vx_setall_f64( maxval ); + + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = thresh2 < v0; + v1 = thresh2 < v1; + v0 = v0 & maxval2; + v1 = v1 & maxval2; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = thresh2 < v0; + v0 = v0 & maxval2; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshBinary(src[j], thresh, maxval); + } + break; + + case THRESH_BINARY_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = v0 <= thresh2; + v1 = v1 <= 
thresh2; + v0 = v0 & maxval2; + v1 = v1 & maxval2; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = v0 <= thresh2; + v0 = v0 & maxval2; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshBinaryInv(src[j], thresh, maxval); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = v_min( v0, thresh2 ); + v1 = v_min( v1, thresh2 ); + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = v_min( v0, thresh2 ); + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshTrunc(src[j], thresh); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = ( thresh2 < v0 ) & v0; + v1 = ( thresh2 < v1 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = ( thresh2 < v0 ) & v0; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZero(src[j], thresh); + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) + { + j = 0; + for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + { + v_float64 v0, v1; + v0 = vx_load( src + j ); + v1 = vx_load( src + j + v_float64::nlanes ); + v0 = ( v0 <= thresh2 ) & v0; + v1 = ( v1 <= thresh2 ) & v1; + v_store( dst + j, v0 ); + v_store( dst + j + v_float64::nlanes, v1 ); + } + if( j <= roi.width - v_float64::nlanes ) + { + v_float64 v0 = vx_load( src + j ); + v0 = ( v0 <= thresh2 ) & v0; + v_store( dst + j, v0 ); + j += v_float64::nlanes; + } + + for( ; j < roi.width; j++ ) + dst[j] = threshToZeroInv(src[j], thresh); + } + break; + default: + CV_Error(CV_StsBadArg, ""); return; + } +#else + threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type); +#endif } #ifdef HAVE_IPP diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp index c36bc1fd6e..6ad51ad512 100644 --- a/modules/imgproc/test/test_color.cpp +++ b/modules/imgproc/test/test_color.cpp @@ -3062,4 +3062,14 @@ TEST(ImgProc_BGR2RGBA, 3ch24ch) EXPECT_DOUBLE_EQ(cvtest::norm(expected - dst, NORM_INF), 0.); } +TEST(ImgProc_RGB2YUV, regression_13668) +{ + Mat src(Size(32, 4), CV_8UC3, Scalar(9, 250, 82)); // Ensure that SIMD code path works + Mat dst; + cvtColor(src, dst, COLOR_RGB2YUV); + Vec3b res = dst.at(0, 0); + Vec3b ref(159, 90, 0); + EXPECT_EQ(res, ref); +} + }} // namespace diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index ec5d4007a3..fadff0c02b 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -387,7 +387,7 @@ bool QRDetect::computeTransformationPoints() findNonZero(mask_roi, non_zero_elem[i]); newHull.insert(newHull.end(), non_zero_elem[i].begin(), 
non_zero_elem[i].end()); } - convexHull(Mat(newHull), locations); + convexHull(newHull, locations); for (size_t i = 0; i < locations.size(); i++) { for (size_t j = 0; j < 3; j++) @@ -556,7 +556,7 @@ vector QRDetect::getQuadrilateral(vector angle_list) } vector integer_hull; - convexHull(Mat(locations), integer_hull); + convexHull(locations, integer_hull); int hull_size = (int)integer_hull.size(); vector hull(hull_size); for (int i = 0; i < hull_size; i++) @@ -901,7 +901,7 @@ bool QRDecode::versionDefinition() vector locations, non_zero_elem; Mat mask_roi = mask(Range(1, intermediate.rows - 1), Range(1, intermediate.cols - 1)); findNonZero(mask_roi, non_zero_elem); - convexHull(Mat(non_zero_elem), locations); + convexHull(non_zero_elem, locations); Point offset = computeOffset(locations); Point temp_remote = locations[0], remote_point; diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp index 2e6251b210..e06dbbf379 100644 --- a/modules/video/src/optflowgf.cpp +++ b/modules/video/src/optflowgf.cpp @@ -646,8 +646,6 @@ private: Size size = frame0.size(); UMat prevFlowX, prevFlowY, curFlowX, curFlowY; - flowx.create(size, CV_32F); - flowy.create(size, CV_32F); UMat flowx0 = flowx; UMat flowy0 = flowy; @@ -1075,12 +1073,19 @@ private: return false; std::vector flowar; - if (!_flow0.empty()) + + // If flag is set, check for integrity; if not set, allocate memory space + if (flags_ & OPTFLOW_USE_INITIAL_FLOW) + { + if (_flow0.empty() || _flow0.size() != _prev0.size() || _flow0.channels() != 2 || + _flow0.depth() != CV_32F) + return false; split(_flow0, flowar); + } else { - flowar.push_back(UMat()); - flowar.push_back(UMat()); + flowar.push_back(UMat(_prev0.size(), CV_32FC1)); + flowar.push_back(UMat(_prev0.size(), CV_32FC1)); } if(!this->operator()(_prev0.getUMat(), _next0.getUMat(), flowar[0], flowar[1])){ return false; @@ -1112,7 +1117,14 @@ void FarnebackOpticalFlowImpl::calc(InputArray _prev0, InputArray _next0, CV_Assert( prev0.size() == next0.size() && prev0.channels() == next0.channels() && prev0.channels() == 1 && pyrScale_ < 1 ); - _flow0.create( prev0.size(), CV_32FC2 ); + + // If flag is set, check for integrity; if not set, allocate memory space + if( flags_ & OPTFLOW_USE_INITIAL_FLOW ) + CV_Assert( _flow0.size() == prev0.size() && _flow0.channels() == 2 && + _flow0.depth() == CV_32F ); + else + _flow0.create( prev0.size(), CV_32FC2 ); + Mat flow0 = _flow0.getMat(); for( k = 0, scale = 1; k < levels; k++ ) diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml index 0e7198a660..d177a09aab 100644 --- a/samples/dnn/models.yml +++ b/samples/dnn/models.yml @@ -90,6 +90,18 @@ squeezenet: classes: "classification_classes_ILSVRC2012.txt" sample: "classification" +# Googlenet from https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet +googlenet: + model: "bvlc_googlenet.caffemodel" + config: "bvlc_googlenet.prototxt" + mean: [104, 117, 123] + scale: 1.0 + width: 224 + height: 224 + rgb: false + classes: "classification_classes_ILSVRC2012.txt" + sample: "classification" + ################################################################################ # Semantic segmentation models. 
################################################################################ diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py index bf04c42174..a644420780 100644 --- a/samples/dnn/tf_text_graph_common.py +++ b/samples/dnn/tf_text_graph_common.py @@ -289,7 +289,7 @@ def removeUnusedNodesAndAttrs(to_remove, graph_def): op = graph_def.node[i].op name = graph_def.node[i].name - if op == 'Const' or to_remove(name, op): + if to_remove(name, op): if op != 'Const': removedNodes.append(name) diff --git a/samples/dnn/tf_text_graph_faster_rcnn.py b/samples/dnn/tf_text_graph_faster_rcnn.py index 13a9c29ec0..e1dfba9fee 100644 --- a/samples/dnn/tf_text_graph_faster_rcnn.py +++ b/samples/dnn/tf_text_graph_faster_rcnn.py @@ -48,10 +48,42 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath): removeIdentity(graph_def) + nodesToKeep = [] def to_remove(name, op): - return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ + if name in nodesToKeep: + return False + return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ (name.startswith('CropAndResize') and op != 'CropAndResize') + # Fuse atrous convolutions (with dilations). + nodesMap = {node.name: node for node in graph_def.node} + for node in reversed(graph_def.node): + if node.op == 'BatchToSpaceND': + del node.input[2] + conv = nodesMap[node.input[0]] + spaceToBatchND = nodesMap[conv.input[0]] + + # Extract paddings + stridedSlice = nodesMap[spaceToBatchND.input[2]] + assert(stridedSlice.op == 'StridedSlice') + pack = nodesMap[stridedSlice.input[0]] + assert(pack.op == 'Pack') + + padNodeH = nodesMap[nodesMap[pack.input[0]].input[0]] + padNodeW = nodesMap[nodesMap[pack.input[1]].input[0]] + padH = int(padNodeH.attr['value']['tensor'][0]['int_val'][0]) + padW = int(padNodeW.attr['value']['tensor'][0]['int_val'][0]) + + paddingsNode = NodeDef() + paddingsNode.name = conv.name + '/paddings' + paddingsNode.op = 'Const' + paddingsNode.addAttr('value', [padH, padH, padW, padW]) + graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode) + nodesToKeep.append(paddingsNode.name) + + spaceToBatchND.input[2] = paddingsNode.name + + removeUnusedNodesAndAttrs(to_remove, graph_def) @@ -225,6 +257,26 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath): detectionOut.addAttr('variance_encoded_in_target', True) graph_def.node.extend([detectionOut]) + def getUnconnectedNodes(): + unconnected = [node.name for node in graph_def.node] + for node in graph_def.node: + for inp in node.input: + if inp in unconnected: + unconnected.remove(inp) + return unconnected + + while True: + unconnectedNodes = getUnconnectedNodes() + unconnectedNodes.remove(detectionOut.name) + if not unconnectedNodes: + break + + for name in unconnectedNodes: + for i in range(len(graph_def.node)): + if graph_def.node[i].name == name: + del graph_def.node[i] + break + # Save as text. 
graph_def.save(outputPath) diff --git a/samples/dnn/tf_text_graph_mask_rcnn.py b/samples/dnn/tf_text_graph_mask_rcnn.py index aaefe456ad..c8803088f9 100644 --- a/samples/dnn/tf_text_graph_mask_rcnn.py +++ b/samples/dnn/tf_text_graph_mask_rcnn.py @@ -55,7 +55,7 @@ graph_def = parseTextGraph(args.output) removeIdentity(graph_def) def to_remove(name, op): - return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ + return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ (name.startswith('CropAndResize') and op != 'CropAndResize') removeUnusedNodesAndAttrs(to_remove, graph_def) diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py index 5017dba7a7..1576380646 100644 --- a/samples/dnn/tf_text_graph_ssd.py +++ b/samples/dnn/tf_text_graph_ssd.py @@ -10,14 +10,60 @@ # Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function. # See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API import argparse +import re from math import sqrt from tf_text_graph_common import * +class SSDAnchorGenerator: + def __init__(self, min_scale, max_scale, num_layers, aspect_ratios, + reduce_boxes_in_lowest_layer, image_width, image_height): + self.min_scale = min_scale + self.aspect_ratios = aspect_ratios + self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer + self.image_width = image_width + self.image_height = image_height + self.scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) + for i in range(num_layers)] + [1.0] + + def get(self, layer_id): + if layer_id == 0 and self.reduce_boxes_in_lowest_layer: + widths = [0.1, self.min_scale * sqrt(2.0), self.min_scale * sqrt(0.5)] + heights = [0.1, self.min_scale / sqrt(2.0), self.min_scale / sqrt(0.5)] + else: + widths = [self.scales[layer_id] * sqrt(ar) for ar in self.aspect_ratios] + heights = [self.scales[layer_id] / sqrt(ar) for ar in self.aspect_ratios] + + widths += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])] + heights += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])] + widths = [w * self.image_width for w in widths] + heights = [h * self.image_height for h in heights] + return widths, heights + + +class MultiscaleAnchorGenerator: + def __init__(self, min_level, aspect_ratios, scales_per_octave, anchor_scale): + self.min_level = min_level + self.aspect_ratios = aspect_ratios + self.anchor_scale = anchor_scale + self.scales = [2**(float(s) / scales_per_octave) for s in range(scales_per_octave)] + + def get(self, layer_id): + widths = [] + heights = [] + for a in self.aspect_ratios: + for s in self.scales: + base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale + ar = sqrt(a) + heights.append(base_anchor_size * s / ar) + widths.append(base_anchor_size * s * ar) + return widths, heights + + def createSSDGraph(modelPath, configPath, outputPath): # Nodes that should be kept. 
- keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu6', 'Placeholder', 'FusedBatchNorm', + keepOps = ['Conv2D', 'BiasAdd', 'Add', 'Relu', 'Relu6', 'Placeholder', 'FusedBatchNorm', 'DepthwiseConv2dNative', 'ConcatV2', 'Mul', 'MaxPool', 'AvgPool', 'Identity', - 'Sub'] + 'Sub', 'ResizeNearestNeighbor', 'Pad'] # Node with which prefixes should be removed prefixesToRemove = ('MultipleGridAnchorGenerator/', 'Postprocessor/', 'Preprocessor/map') @@ -27,26 +73,50 @@ def createSSDGraph(modelPath, configPath, outputPath): config = config['model'][0]['ssd'][0] num_classes = int(config['num_classes'][0]) - ssd_anchor_generator = config['anchor_generator'][0]['ssd_anchor_generator'][0] - min_scale = float(ssd_anchor_generator['min_scale'][0]) - max_scale = float(ssd_anchor_generator['max_scale'][0]) - num_layers = int(ssd_anchor_generator['num_layers'][0]) - aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']] - reduce_boxes_in_lowest_layer = True - if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator: - reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true' - fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0] image_width = int(fixed_shape_resizer['width'][0]) image_height = int(fixed_shape_resizer['height'][0]) box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional' + anchor_generator = config['anchor_generator'][0] + if 'ssd_anchor_generator' in anchor_generator: + ssd_anchor_generator = anchor_generator['ssd_anchor_generator'][0] + min_scale = float(ssd_anchor_generator['min_scale'][0]) + max_scale = float(ssd_anchor_generator['max_scale'][0]) + num_layers = int(ssd_anchor_generator['num_layers'][0]) + aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']] + reduce_boxes_in_lowest_layer = True + if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator: + reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true' + priors_generator = SSDAnchorGenerator(min_scale, max_scale, num_layers, + aspect_ratios, reduce_boxes_in_lowest_layer, + image_width, image_height) + + + print('Scale: [%f-%f]' % (min_scale, max_scale)) + print('Aspect ratios: %s' % str(aspect_ratios)) + print('Reduce boxes in the lowest layer: %s' % str(reduce_boxes_in_lowest_layer)) + elif 'multiscale_anchor_generator' in anchor_generator: + multiscale_anchor_generator = anchor_generator['multiscale_anchor_generator'][0] + min_level = int(multiscale_anchor_generator['min_level'][0]) + max_level = int(multiscale_anchor_generator['max_level'][0]) + anchor_scale = float(multiscale_anchor_generator['anchor_scale'][0]) + aspect_ratios = [float(ar) for ar in multiscale_anchor_generator['aspect_ratios']] + scales_per_octave = int(multiscale_anchor_generator['scales_per_octave'][0]) + num_layers = max_level - min_level + 1 + priors_generator = MultiscaleAnchorGenerator(min_level, aspect_ratios, + scales_per_octave, anchor_scale) + print('Levels: [%d-%d]' % (min_level, max_level)) + print('Anchor scale: %f' % anchor_scale) + print('Scales per octave: %d' % scales_per_octave) + print('Aspect ratios: %s' % str(aspect_ratios)) + else: + print('Unknown anchor_generator') + exit(0) + print('Number of classes: %d' % num_classes) print('Number of layers: %d' % num_layers) - print('Scale: [%f-%f]' % (min_scale, max_scale)) - print('Aspect ratios: %s' % str(aspect_ratios)) - print('Reduce boxes in the lowest layer: %s' % 
str(reduce_boxes_in_lowest_layer)) print('box predictor: %s' % box_predictor) print('Input image size: %dx%d' % (image_width, image_height)) @@ -67,8 +137,8 @@ def createSSDGraph(modelPath, configPath, outputPath): return unconnected - # Detect unfused batch normalization nodes and fuse them. - def fuse_batch_normalization(): + def fuse_nodes(nodesToKeep): + # Detect unfused batch normalization nodes and fuse them. # Add_0 <-- moving_variance, add_y # Rsqrt <-- Add_0 # Mul_0 <-- Rsqrt, gamma @@ -77,9 +147,15 @@ def createSSDGraph(modelPath, configPath, outputPath): # Sub_0 <-- beta, Mul_2 # Add_1 <-- Mul_1, Sub_0 nodesMap = {node.name: node for node in graph_def.node} - subgraph = ['Add', + subgraphBatchNorm = ['Add', ['Mul', 'input', ['Mul', ['Rsqrt', ['Add', 'moving_variance', 'add_y']], 'gamma']], ['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]] + # Detect unfused nearest neighbor resize. + subgraphResizeNN = ['Reshape', + ['Mul', ['Reshape', 'input', ['Pack', 'shape_1', 'shape_2', 'shape_3', 'shape_4', 'shape_5']], + 'ones'], + ['Pack', ['StridedSlice', ['Shape', 'input'], 'stack', 'stack_1', 'stack_2'], + 'out_height', 'out_width', 'out_channels']] def checkSubgraph(node, targetNode, inputs, fusedNodes): op = targetNode[0] if node.op == op and (len(node.input) >= len(targetNode) - 1): @@ -100,7 +176,7 @@ def createSSDGraph(modelPath, configPath, outputPath): for node in graph_def.node: inputs = {} fusedNodes = [] - if checkSubgraph(node, subgraph, inputs, fusedNodes): + if checkSubgraph(node, subgraphBatchNorm, inputs, fusedNodes): name = node.name node.Clear() node.name = name @@ -112,15 +188,41 @@ def createSSDGraph(modelPath, configPath, outputPath): node.input.append(inputs['moving_variance']) node.addAttr('epsilon', 0.001) nodesToRemove += fusedNodes[1:] + + inputs = {} + fusedNodes = [] + if checkSubgraph(node, subgraphResizeNN, inputs, fusedNodes): + name = node.name + node.Clear() + node.name = name + node.op = 'ResizeNearestNeighbor' + node.input.append(inputs['input']) + node.input.append(name + '/output_shape') + + out_height_node = nodesMap[inputs['out_height']] + out_width_node = nodesMap[inputs['out_width']] + out_height = int(out_height_node.attr['value']['tensor'][0]['int_val'][0]) + out_width = int(out_width_node.attr['value']['tensor'][0]['int_val'][0]) + + shapeNode = NodeDef() + shapeNode.name = name + '/output_shape' + shapeNode.op = 'Const' + shapeNode.addAttr('value', [out_height, out_width]) + graph_def.node.insert(graph_def.node.index(node), shapeNode) + nodesToKeep.append(shapeNode.name) + + nodesToRemove += fusedNodes[1:] for node in nodesToRemove: graph_def.node.remove(node) - fuse_batch_normalization() + nodesToKeep = [] + fuse_nodes(nodesToKeep) removeIdentity(graph_def) def to_remove(name, op): - return (not op in keepOps) or name.startswith(prefixesToRemove) + return (not name in nodesToKeep) and \ + (op == 'Const' or (not op in keepOps) or name.startswith(prefixesToRemove)) removeUnusedNodesAndAttrs(to_remove, graph_def) @@ -169,19 +271,15 @@ def createSSDGraph(modelPath, configPath, outputPath): graph_def.node.extend([flatten]) addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten') - idx = 0 + num_matched_layers = 0 for node in graph_def.node: - if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx) or \ - node.name == ('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/Conv2D' % idx) or \ - node.name == 'WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D': + if 
re.match('BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \ + re.match('WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name): node.addAttr('loc_pred_transposed', True) - idx += 1 - assert(idx == num_layers) + num_matched_layers += 1 + assert(num_matched_layers == num_layers) # Add layers that generate anchors (bounding boxes proposals). - scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) - for i in range(num_layers)] + [1.0] - priorBoxes = [] for i in range(num_layers): priorBox = NodeDef() @@ -199,17 +297,8 @@ def createSSDGraph(modelPath, configPath, outputPath): priorBox.addAttr('flip', False) priorBox.addAttr('clip', False) - if i == 0 and reduce_boxes_in_lowest_layer: - widths = [0.1, min_scale * sqrt(2.0), min_scale * sqrt(0.5)] - heights = [0.1, min_scale / sqrt(2.0), min_scale / sqrt(0.5)] - else: - widths = [scales[i] * sqrt(ar) for ar in aspect_ratios] - heights = [scales[i] / sqrt(ar) for ar in aspect_ratios] + widths, heights = priors_generator.get(i) - widths += [sqrt(scales[i] * scales[i + 1])] - heights += [sqrt(scales[i] * scales[i + 1])] - widths = [w * image_width for w in widths] - heights = [h * image_height for h in heights] priorBox.addAttr('width', widths) priorBox.addAttr('height', heights) priorBox.addAttr('variance', [0.1, 0.1, 0.2, 0.2]) @@ -217,6 +306,7 @@ def createSSDGraph(modelPath, configPath, outputPath): graph_def.node.extend([priorBox]) priorBoxes.append(priorBox.name) + # Compare this layer's output with Postprocessor/Reshape addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten') # Sigmoid for classes predictions and DetectionOutput layer
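To make the prior-box arithmetic of the SSDAnchorGenerator class introduced above concrete, here is a small standalone sketch of the same scale/width/height computation. The config values (min_scale 0.2, max_scale 0.95, 6 layers, 300x300 input) are typical ssd_mobilenet settings used purely for illustration; the real numbers come from the pipeline .config file parsed by createSSDGraph().

from math import sqrt

# Illustrative SSD settings, not values taken from this patch.
min_scale, max_scale, num_layers = 0.2, 0.95, 6
aspect_ratios = [1.0, 2.0, 0.5, 3.0, 1.0 / 3]
image_width = image_height = 300

# Same per-layer scales as SSDAnchorGenerator.__init__.
scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
          for i in range(num_layers)] + [1.0]

# Same box sizes as SSDAnchorGenerator.get() for a layer above the first one.
layer_id = 1
widths = [scales[layer_id] * sqrt(ar) for ar in aspect_ratios]
heights = [scales[layer_id] / sqrt(ar) for ar in aspect_ratios]
# Extra box whose size is the geometric mean of this scale and the next one.
widths += [sqrt(scales[layer_id] * scales[layer_id + 1])]
heights += [sqrt(scales[layer_id] * scales[layer_id + 1])]
widths = [w * image_width for w in widths]
heights = [h * image_height for h in heights]
print([round(w, 1) for w in widths])   # prior-box widths in pixels for this feature map
print([round(h, 1) for h in heights])  # prior-box heights in pixels

These widths and heights are exactly what gets written into the PriorBox nodes via priors_generator.get(i) in the loop over layers.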