diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp index 7ec6bc55cd..43877e4848 100644 --- a/modules/video/include/opencv2/video/tracking.hpp +++ b/modules/video/include/opencv2/video/tracking.hpp @@ -849,6 +849,43 @@ public: //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE; }; +/** @brief the Nano tracker is a super lightweight dnn-based general object tracking. + * + * Nano tracker is much faster and extremely lightweight due to special model structure, the whole model size is about 1.1 MB. + * Nano tracker needs two models: one for feature extraction (backbone) and the another for localization (neckhead). + * Please download these two onnx models at:https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack/models/onnx. + * Original repo is here: https://github.com/HonglinChu/NanoTrack + * Author:HongLinChu, 1628464345@qq.com + */ +class CV_EXPORTS_W TrackerNano : public Tracker +{ +protected: + TrackerNano(); // use ::create() +public: + virtual ~TrackerNano() CV_OVERRIDE; + + struct CV_EXPORTS_W_SIMPLE Params + { + CV_WRAP Params(); + CV_PROP_RW std::string backbone; + CV_PROP_RW std::string neckhead; + CV_PROP_RW int backend; + CV_PROP_RW int target; + }; + + /** @brief Constructor + @param parameters NanoTrack parameters TrackerNano::Params + */ + static CV_WRAP + Ptr create(const TrackerNano::Params& parameters = TrackerNano::Params()); + + /** @brief Return tracking score + */ + CV_WRAP virtual float getTrackingScore() = 0; + + //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE; + //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE; +}; //! 
@} video_track diff --git a/modules/video/misc/python/pyopencv_video.hpp b/modules/video/misc/python/pyopencv_video.hpp index ea8977911f..02d890d859 100644 --- a/modules/video/misc/python/pyopencv_video.hpp +++ b/modules/video/misc/python/pyopencv_video.hpp @@ -2,4 +2,5 @@ typedef TrackerMIL::Params TrackerMIL_Params; typedef TrackerGOTURN::Params TrackerGOTURN_Params; typedef TrackerDaSiamRPN::Params TrackerDaSiamRPN_Params; +typedef TrackerNano::Params TrackerNano_Params; #endif diff --git a/modules/video/src/tracking/tracker_nano.cpp b/modules/video/src/tracking/tracker_nano.cpp new file mode 100644 index 0000000000..aaf2d0f2e9 --- /dev/null +++ b/modules/video/src/tracking/tracker_nano.cpp @@ -0,0 +1,359 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// This file is modified from the https://github.com/HonglinChu/NanoTrack/blob/master/ncnn_macos_nanotrack/nanotrack.cpp +// Author, HongLinChu, 1628464345@qq.com +// Adapt to OpenCV, ZihaoMu: zihaomu@outlook.com + +// Link to original inference code: https://github.com/HonglinChu/NanoTrack +// Link to original training repo: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack + +#include "../precomp.hpp" +#ifdef HAVE_OPENCV_DNN +#include "opencv2/dnn.hpp" +#endif + +namespace cv { + +TrackerNano::TrackerNano() +{ + // nothing +} + +TrackerNano::~TrackerNano() +{ + // nothing +} + +TrackerNano::Params::Params() +{ + backbone = "backbone.onnx"; + neckhead = "neckhead.onnx"; +#ifdef HAVE_OPENCV_DNN + backend = dnn::DNN_BACKEND_DEFAULT; + target = dnn::DNN_TARGET_CPU; +#else + backend = -1; // invalid value + target = -1; // invalid value +#endif +} + +#ifdef HAVE_OPENCV_DNN +static void softmax(const Mat& src, Mat& dst) +{ + Mat maxVal; + cv::max(src.row(1), src.row(0), maxVal); + + src.row(1) -= maxVal; + src.row(0) -= maxVal; + + exp(src, dst); + + 
Mat sumVal = dst.row(0) + dst.row(1); // per-position normalizer over the two class rows
+    dst.row(0) = dst.row(0) / sumVal;
+    dst.row(1) = dst.row(1) / sumVal;
+}
+
+static float sizeCal(float w, float h) // scalar form: sqrt((w + pad) * (h + pad)) with pad = (w + h) / 2
+{
+    float pad = (w + h) * 0.5f;
+    float sz2 = (w + pad) * (h + pad);
+    return sqrt(sz2);
+}
+
+static Mat sizeCal(const Mat& w, const Mat& h) // element-wise form of the scalar sizeCal above
+{
+    Mat pad = (w + h) * 0.5;
+    Mat sz2 = (w + pad).mul((h + pad));
+
+    cv::sqrt(sz2, sz2); // in-place square root, sz2 is a fresh matrix owned by this function
+    return sz2;
+}
+
+// In-place element-wise max(v, 1/v); similar python code: r = np.maximum(r, 1. / r) # r is matrix
+static void elementReciprocalMax(Mat& srcDst)
+{
+    size_t totalV = srcDst.total();
+    float* ptr = srcDst.ptr<float>(0); // data is walked as one flat float array, so srcDst must be a continuous CV_32F matrix
+    for (size_t i = 0; i < totalV; i++)
+    {
+        float val = *(ptr + i);
+        *(ptr + i) = std::max(val, 1.0f/val);
+    }
+}
+
+class TrackerNanoImpl : public TrackerNano
+{
+public:
+    TrackerNanoImpl(const TrackerNano::Params& parameters) // loads both networks from the paths given in parameters
+        : params(parameters)
+    {
+        backbone = dnn::readNet(params.backbone);
+        neckhead = dnn::readNet(params.neckhead);
+
+        CV_Assert(!backbone.empty());
+        CV_Assert(!neckhead.empty());
+
+        backbone.setPreferableBackend(params.backend);
+        backbone.setPreferableTarget(params.target);
+        neckhead.setPreferableBackend(params.backend);
+        neckhead.setPreferableTarget(params.target);
+    }
+
+    void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE;
+    float getTrackingScore() CV_OVERRIDE;
+
+    // Save the target bounding box for each frame.
+ std::vector targetSz = {0, 0}; // H and W of bounding box + std::vector targetPos = {0, 0}; // center point of bounding box (x, y) + float tracking_score; + + TrackerNano::Params params; + + struct trackerConfig + { + float windowInfluence = 0.455f; + float lr = 0.37f; + float contextAmount = 0.5; + bool swapRB = true; + int totalStride = 16; + float penaltyK = 0.055f; + }; + +protected: + const int exemplarSize = 127; + const int instanceSize = 255; + + trackerConfig trackState; + int scoreSize; + Size imgSize = {0, 0}; + Mat hanningWindow; + Mat grid2searchX, grid2searchY; + + dnn::Net backbone, neckhead; + Mat image; + + void getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz); + void generateGrids(); +}; + +void TrackerNanoImpl::generateGrids() +{ + int sz = scoreSize; + const int sz2 = sz / 2; + + std::vector x1Vec(sz, 0); + + for (int i = 0; i < sz; i++) + { + x1Vec[i] = i - sz2; + } + + Mat x1M(1, sz, CV_32FC1, x1Vec.data()); + + cv::repeat(x1M, sz, 1, grid2searchX); + cv::repeat(x1M.t(), 1, sz, grid2searchY); + + grid2searchX *= trackState.totalStride; + grid2searchY *= trackState.totalStride; + + grid2searchX += instanceSize/2; + grid2searchY += instanceSize/2; +} + +void TrackerNanoImpl::init(InputArray image_, const Rect &boundingBox_) +{ + scoreSize = (instanceSize - exemplarSize) / trackState.totalStride + 8; + trackState = trackerConfig(); + image = image_.getMat().clone(); + + // convert Rect2d from left-up to center. + targetPos[0] = float(boundingBox_.x) + float(boundingBox_.width) * 0.5f; + targetPos[1] = float(boundingBox_.y) + float(boundingBox_.height) * 0.5f; + + targetSz[0] = float(boundingBox_.width); + targetSz[1] = float(boundingBox_.height); + + imgSize = image.size(); + + // Extent the bounding box. 
+ float sumSz = targetSz[0] + targetSz[1]; + float wExtent = targetSz[0] + trackState.contextAmount * (sumSz); + float hExtent = targetSz[1] + trackState.contextAmount * (sumSz); + int sz = int(cv::sqrt(wExtent * hExtent)); + + Mat crop; + getSubwindow(crop, image, sz, exemplarSize); + Mat blob = dnn::blobFromImage(crop, 1.0, Size(), Scalar(), trackState.swapRB); + + backbone.setInput(blob); + Mat out = backbone.forward(); // Feature extraction. + neckhead.setInput(out, "input1"); + + createHanningWindow(hanningWindow, Size(scoreSize, scoreSize), CV_32F); + generateGrids(); +} + +void TrackerNanoImpl::getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz) +{ + Scalar avgChans = mean(srcImg); + Size imgSz = srcImg.size(); + int c = (originalSz + 1) / 2; + + int context_xmin = targetPos[0] - c; + int context_xmax = context_xmin + originalSz - 1; + int context_ymin = targetPos[1] - c; + int context_ymax = context_ymin + originalSz - 1; + + int left_pad = std::max(0, -context_xmin); + int top_pad = std::max(0, -context_ymin); + int right_pad = std::max(0, context_xmax - imgSz.width + 1); + int bottom_pad = std::max(0, context_ymax - imgSz.height + 1); + + context_xmin += left_pad; + context_xmax += left_pad; + context_ymin += top_pad; + context_ymax += top_pad; + + Mat cropImg; + if (left_pad == 0 && top_pad == 0 && right_pad == 0 && bottom_pad == 0) + { + // Crop image without padding. 
+        cropImg = srcImg(cv::Rect(context_xmin, context_ymin,
+                         context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
+    }
+    else // Crop image with padding, and the padding value is avgChans
+    {
+        cv::Mat tmpMat;
+        cv::copyMakeBorder(srcImg, tmpMat, top_pad, bottom_pad, left_pad, right_pad, cv::BORDER_CONSTANT, avgChans);
+        cropImg = tmpMat(cv::Rect(context_xmin, context_ymin, context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
+    }
+    resize(cropImg, dstCrop, Size(resizeSz, resizeSz));
+}
+
+bool TrackerNanoImpl::update(InputArray image_, Rect &boundingBoxRes) // tracks the target in image_ and writes the new box to boundingBoxRes
+{
+    image = image_.getMat().clone();
+    float targetSzSum = targetSz[0] + targetSz[1]; // kept as float, same as the context computation in init()
+
+    float wc = targetSz[0] + trackState.contextAmount * targetSzSum;
+    float hc = targetSz[1] + trackState.contextAmount * targetSzSum;
+    float sz = cv::sqrt(wc * hc);
+    float scale_z = exemplarSize / sz; // scale that maps the context window onto an exemplarSize patch
+    float sx = sz * (float(instanceSize) / float(exemplarSize)); // float ratio: integer division gave 2 and made the crop scale disagree with scale_z
+    targetSz[0] *= scale_z; // target size is expressed in network-input pixels until it is divided by scale_z again
+    targetSz[1] *= scale_z;
+
+    Mat crop;
+    getSubwindow(crop, image, int(sx), instanceSize);
+
+    Mat blob = dnn::blobFromImage(crop, 1.0, Size(), Scalar(), trackState.swapRB);
+    backbone.setInput(blob);
+    Mat xf = backbone.forward();
+    neckhead.setInput(xf, "input2");
+    std::vector<String> outputName = {"output1", "output2"};
+    std::vector<Mat> outs;
+    neckhead.forward(outs, outputName);
+
+    CV_Assert(outs.size() == 2);
+
+    Mat clsScore = outs[0]; // 1x2x16x16
+    Mat bboxPred = outs[1]; // 1x4x16x16
+
+    clsScore = clsScore.reshape(0, {2, scoreSize, scoreSize});
+    bboxPred = bboxPred.reshape(0, {4, scoreSize, scoreSize});
+
+    Mat scoreSoftmax; // 2x16x16
+    softmax(clsScore, scoreSoftmax);
+
+    Mat score = scoreSoftmax.row(1);
+    score = score.reshape(0, {scoreSize, scoreSize});
+
+    Mat predX1 = grid2searchX - bboxPred.row(0).reshape(0, {scoreSize, scoreSize});
+    Mat predY1 = grid2searchY - bboxPred.row(1).reshape(0, {scoreSize, scoreSize});
+    Mat predX2 = grid2searchX + bboxPred.row(2).reshape(0, {scoreSize, scoreSize});
+    Mat predY2 = 
grid2searchY + bboxPred.row(3).reshape(0, {scoreSize, scoreSize});
+
+    // size penalty
+    // scale penalty: predicted box size relative to the current target size (targetSz is in network-input pixels here)
+    Mat sc = sizeCal(predX2 - predX1, predY2 - predY1)/sizeCal(targetSz[0], targetSz[1]);
+    elementReciprocalMax(sc);
+
+    // ratio penalty
+    float ratioVal = targetSz[0] / targetSz[1];
+
+    Mat ratioM(scoreSize, scoreSize, CV_32FC1, Scalar::all(ratioVal));
+    Mat rc = ratioM / ((predX2 - predX1) / (predY2 - predY1));
+    elementReciprocalMax(rc);
+
+    Mat penalty;
+    exp(((rc.mul(sc) - 1) * trackState.penaltyK * (-1)), penalty);
+    Mat pscore = penalty.mul(score);
+
+    // Window penalty
+    pscore = pscore * (1.0 - trackState.windowInfluence) + hanningWindow * trackState.windowInfluence;
+
+    // get Max
+    int bestID[2] = { 0, 0 };
+    minMaxIdx(pscore, 0, 0, 0, bestID);
+
+    tracking_score = pscore.at<float>(bestID);
+
+    float x1Val = predX1.at<float>(bestID);
+    float x2Val = predX2.at<float>(bestID);
+    float y1Val = predY1.at<float>(bestID);
+    float y2Val = predY2.at<float>(bestID);
+
+    float predXs = (x1Val + x2Val)/2;
+    float predYs = (y1Val + y2Val)/2;
+    float predW = (x2Val - x1Val)/scale_z;
+    float predH = (y2Val - y1Val)/scale_z;
+
+    float diffXs = (predXs - instanceSize / 2) / scale_z;
+    float diffYs = (predYs - instanceSize / 2) / scale_z;
+
+    targetSz[0] /= scale_z; // back to source-image pixels
+    targetSz[1] /= scale_z;
+
+    float lr = penalty.at<float>(bestID) * score.at<float>(bestID) * trackState.lr;
+
+    float resX = targetPos[0] + diffXs;
+    float resY = targetPos[1] + diffYs;
+    float resW = predW * lr + (1 - lr) * targetSz[0];
+    float resH = predH * lr + (1 - lr) * targetSz[1];
+
+    resX = std::max(0.f, std::min((float)imgSize.width, resX));
+    resY = std::max(0.f, std::min((float)imgSize.height, resY));
+    resW = std::max(10.f, std::min((float)imgSize.width, resW));
+    resH = std::max(10.f, std::min((float)imgSize.height, resH));
+
+    targetPos[0] = resX;
+    targetPos[1] = resY;
+    targetSz[0] = resW;
+    targetSz[1] = resH;
+
+    // convert center to Rect.
+ boundingBoxRes = { int(resX - resW/2), int(resY - resH/2), int(resW), int(resH)}; + return true; +} + +float TrackerNanoImpl::getTrackingScore() +{ + return tracking_score; +} + +Ptr TrackerNano::create(const TrackerNano::Params& parameters) +{ + return makePtr(parameters); +} + +#else // OPENCV_HAVE_DNN +Ptr TrackerNano::create(const TrackerNano::Params& parameters) +{ + CV_UNUSED(parameters); + CV_Error(cv::Error::StsNotImplemented, "to use NanoTrack, the tracking module needs to be built with opencv_dnn !"); +} +#endif // OPENCV_HAVE_DNN +} diff --git a/modules/video/test/test_trackers.cpp b/modules/video/test/test_trackers.cpp index 2d0e184408..d080198116 100644 --- a/modules/video/test/test_trackers.cpp +++ b/modules/video/test/test_trackers.cpp @@ -64,40 +64,67 @@ TEST_P(DistanceAndOverlap, GOTURN) INSTANTIATE_TEST_CASE_P(Tracking, DistanceAndOverlap, TESTSET_NAMES); -TEST(GOTURN, memory_usage) +static bool checkIOU(const Rect& r0, const Rect& r1, double threshold) { - cv::Rect roi(145, 70, 85, 85); + int interArea = (r0 & r1).area(); + double iouVal = (interArea * 1.0 )/ (r0.area() + r1.area() - interArea);; + if (iouVal > threshold) + return true; + else + { + std::cout <<"Unmatched IOU: expect IOU val ("< the IOU threadhold ("<& tracker, double iouThreshold = 0.8) +{ + // Template image + Mat img0 = imread(findDataFile("tracking/bag/00000001.jpg"), 1); + + // Tracking image sequence. 
+ std::vector imgs; + imgs.push_back(imread(findDataFile("tracking/bag/00000002.jpg"), 1)); + imgs.push_back(imread(findDataFile("tracking/bag/00000003.jpg"), 1)); + imgs.push_back(imread(findDataFile("tracking/bag/00000004.jpg"), 1)); + imgs.push_back(imread(findDataFile("tracking/bag/00000005.jpg"), 1)); + imgs.push_back(imread(findDataFile("tracking/bag/00000006.jpg"), 1)); + + cv::Rect roi(325, 164, 100, 100); + std::vector targetRois; + targetRois.push_back(cv::Rect(278, 133, 99, 104)); + targetRois.push_back(cv::Rect(293, 88, 93, 110)); + targetRois.push_back(cv::Rect(287, 76, 89, 116)); + targetRois.push_back(cv::Rect(297, 74, 82, 122)); + targetRois.push_back(cv::Rect(311, 83, 78, 125)); + + tracker->init(img0, roi); + CV_Assert(targetRois.size() == imgs.size()); + + for (int i = 0; i < (int)imgs.size(); i++) + { + bool res = tracker->update(imgs[i], roi); + ASSERT_TRUE(res); + ASSERT_TRUE(checkIOU(roi, targetRois[i], iouThreshold)) << cv::format("Fail at img %d.",i); + } +} + +TEST(GOTURN, accuracy) +{ std::string model = cvtest::findDataFile("dnn/gsoc2016-goturn/goturn.prototxt"); std::string weights = cvtest::findDataFile("dnn/gsoc2016-goturn/goturn.caffemodel", false); cv::TrackerGOTURN::Params params; params.modelTxt = model; params.modelBin = weights; cv::Ptr tracker = TrackerGOTURN::create(params); - - string inputVideo = cvtest::findDataFile("tracking/david/data/david.webm"); - cv::VideoCapture video(inputVideo); - ASSERT_TRUE(video.isOpened()) << inputVideo; - - cv::Mat frame; - video >> frame; - ASSERT_FALSE(frame.empty()) << inputVideo; - tracker->init(frame, roi); - string ground_truth_bb; - for (int nframes = 0; nframes < 15; ++nframes) - { - std::cout << "Frame: " << nframes << std::endl; - video >> frame; - bool res = tracker->update(frame, roi); - ASSERT_TRUE(res); - std::cout << "Predicted ROI: " << roi << std::endl; - } + // TODO! GOTURN have low accuracy. Try to remove this api at 5.x. 
+ checkTrackingAccuracy(tracker, 0.08); } -TEST(DaSiamRPN, memory_usage) +TEST(DaSiamRPN, accuracy) { - cv::Rect roi(145, 70, 85, 85); - std::string model = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_model.onnx", false); std::string kernel_r1 = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_kernel_r1.onnx", false); std::string kernel_cls1 = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_kernel_cls1.onnx", false); @@ -106,24 +133,18 @@ TEST(DaSiamRPN, memory_usage) params.kernel_r1 = kernel_r1; params.kernel_cls1 = kernel_cls1; cv::Ptr tracker = TrackerDaSiamRPN::create(params); - - string inputVideo = cvtest::findDataFile("tracking/david/data/david.webm"); - cv::VideoCapture video(inputVideo); - ASSERT_TRUE(video.isOpened()) << inputVideo; - - cv::Mat frame; - video >> frame; - ASSERT_FALSE(frame.empty()) << inputVideo; - tracker->init(frame, roi); - string ground_truth_bb; - for (int nframes = 0; nframes < 15; ++nframes) - { - std::cout << "Frame: " << nframes << std::endl; - video >> frame; - bool res = tracker->update(frame, roi); - ASSERT_TRUE(res); - std::cout << "Predicted ROI: " << roi << std::endl; - } + checkTrackingAccuracy(tracker, 0.7); } +TEST(NanoTrack, accuracy) +{ + std::string backbonePath = cvtest::findDataFile("dnn/onnx/models/nanotrack_backbone_sim.onnx", false); + std::string neckheadPath = cvtest::findDataFile("dnn/onnx/models/nanotrack_head_sim.onnx", false); + + cv::TrackerNano::Params params; + params.backbone = backbonePath; + params.neckhead = neckheadPath; + cv::Ptr tracker = TrackerNano::create(params); + checkTrackingAccuracy(tracker); +} }} // namespace opencv_test:: diff --git a/samples/dnn/nanotrack_tracker.cpp b/samples/dnn/nanotrack_tracker.cpp new file mode 100644 index 0000000000..e98e301f13 --- /dev/null +++ b/samples/dnn/nanotrack_tracker.cpp @@ -0,0 +1,183 @@ +// NanoTrack +// Link to original inference code: https://github.com/HonglinChu/NanoTrack +// Link to original training repo: 
https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack +// backBone model: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/onnx/nanotrack_backbone_sim.onnx +// headNeck model: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/onnx/nanotrack_head_sim.onnx + +#include +#include + +#include +#include +#include +#include + +using namespace cv; +using namespace cv::dnn; + +const char *keys = + "{ help h | | Print help message }" + "{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }" + "{ backbone | backbone.onnx | Path to onnx model of backbone.onnx}" + "{ headneck | headneck.onnx | Path to onnx model of headneck.onnx }" + "{ backend | 0 | Choose one of computation backends: " + "0: automatically (by default), " + "1: Halide language (http://halide-lang.org/), " + "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), " + "3: OpenCV implementation, " + "4: VKCOM, " + "5: CUDA }," + "{ target | 0 | Choose one of target computation devices: " + "0: CPU target (by default), " + "1: OpenCL, " + "2: OpenCL fp16 (half-float precision), " + "3: VPU, " + "4: Vulkan, " + "6: CUDA, " + "7: CUDA fp16 (half-float preprocess) }" +; + +static +int run(int argc, char** argv) +{ + // Parse command line arguments. 
+ CommandLineParser parser(argc, argv, keys); + + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } + + std::string inputName = parser.get("input"); + std::string backbone = parser.get("backbone"); + std::string headneck = parser.get("headneck"); + int backend = parser.get("backend"); + int target = parser.get("target"); + + Ptr tracker; + try + { + TrackerNano::Params params; + params.backbone = samples::findFile(backbone); + params.neckhead = samples::findFile(headneck); + params.backend = backend; + params.target = target; + tracker = TrackerNano::create(params); + } + catch (const cv::Exception& ee) + { + std::cerr << "Exception: " << ee.what() << std::endl; + std::cout << "Can't load the network by using the following files:" << std::endl; + std::cout << "backbone : " << backbone << std::endl; + std::cout << "headneck : " << headneck << std::endl; + return 2; + } + + const std::string winName = "NanoTrack"; + namedWindow(winName, WINDOW_AUTOSIZE); + + // Open a video file or an image file or a camera stream. + VideoCapture cap; + + if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1)) + { + int c = inputName.empty() ? 0 : inputName[0] - '0'; + std::cout << "Trying to open camera #" << c << " ..." << std::endl; + if (!cap.open(c)) + { + std::cout << "Capture from camera #" << c << " didn't work. Specify -i=