Merge pull request #24201 from lpylpy0514:4.x

VIT track (GSoC real-time object tracking model) #24201

VIT tracker (vision transformer tracker) is a much better model for real-time object tracking. On an ARM chip, the VIT tracker runs more than 20% faster than NanoTrack in single-threaded mode, and the advantage becomes even more pronounced in multi-threaded mode. In addition, the VIT tracker demonstrates better accuracy than NanoTrack on the LaSOT dataset. Moreover, the VIT tracker provides a confidence value during tracking, which can be used to determine whether the target is currently lost.
opencv_zoo: https://github.com/opencv/opencv_zoo/pull/194
opencv_extra: https://github.com/opencv/opencv_extra/pull/1088

# Performance comparison
NOTE: The speeds below were measured with **onnxruntime**, because OpenCV currently has poor support for transformer architectures.

ONNX Runtime speed test on an ARM platform (Apple M2), in ms:
| threads | 1| 2| 3| 4|
|--------|--------|--------|--------|--------|
| nanotrack| 5.25| 4.86| 4.72| 4.49|
| vit tracker| 4.18| 2.41| 1.97| **1.46 (3X)**|

ONNX Runtime speed test on an x86 platform (Intel i3-10105), in ms:
| threads | 1| 2| 3| 4|
|--------|--------|--------|--------|--------|
| nanotrack|3.20|2.75|2.46|2.55|
| vit tracker|3.84|2.37|2.10|2.01|

OpenCV speed test on an x86 platform (Intel i3-10105), in ms:
| threads | 1| 2| 3| 4|
|--------|--------|--------|--------|--------|
| vit tracker|31.3|31.4|31.4|31.4|

Performance test on the LaSOT dataset (AUC is the most important metric; a higher AUC means a better tracker):

| LaSOT | AUC| P| Pnorm|
|--------|--------|--------|--------|
| nanotrack| 46.8| 45.0| 43.3|
| vit tracker| 48.6| 44.8| 54.7|

https://youtu.be/MJiPnu1ZQRI

In object tracking tasks, the score is an important indicator of whether the current target is lost. In the video, the VIT tracker follows the target and displays the current score in the upper-left corner of the frame. When the target is lost, the score drops significantly. NanoTrack, by contrast, returns a score of about 0.9 in every situation, so it cannot tell us whether the target is lost.
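As a sketch of how the score can be consumed in application code (the model path, input video, initial box, and the 0.3 threshold below are illustrative assumptions, not values prescribed by this PR):

```cpp
#include <iostream>
#include <opencv2/video/tracking.hpp>
#include <opencv2/videoio.hpp>

int main()
{
    cv::TrackerVit::Params params;
    params.net = "vitTracker.onnx";  // model from the opencv_zoo link above
    cv::Ptr<cv::TrackerVit> tracker = cv::TrackerVit::create(params);

    cv::VideoCapture cap("test.avi");  // hypothetical input video
    cv::Mat frame;
    cap >> frame;
    tracker->init(frame, cv::Rect(100, 100, 80, 80));  // hypothetical initial box

    while (cap.read(frame))
    {
        cv::Rect box;
        tracker->update(frame, box);
        if (tracker->getTrackingScore() < 0.3f)  // illustrative threshold; tune per application
        {
            std::cout << "Low score: target probably lost" << std::endl;
            break;
        }
    }
    return 0;
}
```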

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
### Files changed

- `modules/video/include/opencv2/video/tracking.hpp` (37 lines)
- `modules/video/src/tracking/tracker_vit.cpp` (219 lines)
- `modules/video/test/test_trackers.cpp` (9 lines)
- `samples/dnn/vit_tracker.cpp` (176 lines)
- `samples/python/tracker.py` (8 lines)

@@ -887,6 +887,43 @@ public:

```cpp
    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};

/** @brief The VIT tracker is a super-lightweight DNN-based general object tracker.
 *
 * VIT tracker is much faster and extremely lightweight due to its special model structure;
 * the model file is about 767KB.
 * Model download link: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack
 * Author: PengyuLiu, 1872918507@qq.com
 */
class CV_EXPORTS_W TrackerVit : public Tracker
{
protected:
    TrackerVit();  // use ::create()
public:
    virtual ~TrackerVit() CV_OVERRIDE;

    struct CV_EXPORTS_W_SIMPLE Params
    {
        CV_WRAP Params();
        CV_PROP_RW std::string net;
        CV_PROP_RW int backend;
        CV_PROP_RW int target;
        CV_PROP_RW Scalar meanvalue;
        CV_PROP_RW Scalar stdvalue;
    };

    /** @brief Constructor
    @param parameters VIT tracker parameters TrackerVit::Params
    */
    static CV_WRAP
    Ptr<TrackerVit> create(const TrackerVit::Params& parameters = TrackerVit::Params());

    /** @brief Return tracking score
    */
    CV_WRAP virtual float getTrackingScore() = 0;

    // void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
    // bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};

//! @} video_track
} // cv
```
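For readers of the header alone, a minimal sketch of configuring `Params` (the CUDA constants are the standard `cv::dnn` enums and assume a CUDA-enabled OpenCV build; the defaults `DNN_BACKEND_DEFAULT` / `DNN_TARGET_CPU` are fine for CPU-only builds):

```cpp
#include <opencv2/video/tracking.hpp>
#include <opencv2/dnn.hpp>

int main()
{
    cv::TrackerVit::Params params;
    params.net = "vitTracker.onnx";              // model from the opencv_zoo link above
    params.backend = cv::dnn::DNN_BACKEND_CUDA;  // assumption: OpenCV built with CUDA
    params.target = cv::dnn::DNN_TARGET_CUDA;

    cv::Ptr<cv::TrackerVit> tracker = cv::TrackerVit::create(params);
    return tracker.empty() ? 1 : 0;
}
```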

@@ -0,0 +1,219 @@

```cpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// Author: PengyuLiu, 1872918507@qq.com

#include "../precomp.hpp"
#ifdef HAVE_OPENCV_DNN
#include "opencv2/dnn.hpp"
#endif

namespace cv {

TrackerVit::TrackerVit()
{
    // nothing
}

TrackerVit::~TrackerVit()
{
    // nothing
}

TrackerVit::Params::Params()
{
    net = "vitTracker.onnx";
    meanvalue = Scalar{0.485, 0.456, 0.406};
    stdvalue = Scalar{0.229, 0.224, 0.225};
#ifdef HAVE_OPENCV_DNN
    backend = dnn::DNN_BACKEND_DEFAULT;
    target = dnn::DNN_TARGET_CPU;
#else
    backend = -1;  // invalid value
    target = -1;   // invalid value
#endif
}

#ifdef HAVE_OPENCV_DNN

class TrackerVitImpl : public TrackerVit
{
public:
    TrackerVitImpl(const TrackerVit::Params& parameters)
        : params(parameters)
    {
        net = dnn::readNet(params.net);
        CV_Assert(!net.empty());
        net.setPreferableBackend(params.backend);
        net.setPreferableTarget(params.target);
    }

    void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
    bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE;
    float getTrackingScore() CV_OVERRIDE;

    Rect rect_last;
    float tracking_score = 0.f;

    TrackerVit::Params params;

protected:
    void preprocess(const Mat& src, Mat& dst, Size size);

    const Size searchSize{256, 256};
    const Size templateSize{128, 128};

    Mat hanningWindow;

    dnn::Net net;
    Mat image;
};

static void crop_image(const Mat& src, Mat& dst, Rect box, int factor)
{
    int x = box.x, y = box.y, w = box.width, h = box.height;
    int crop_sz = cvCeil(sqrt(w * h) * factor);

    int x1 = cvRound(x + 0.5 * w - crop_sz * 0.5);
    int x2 = x1 + crop_sz;
    int y1 = cvRound(y + 0.5 * h - crop_sz * 0.5);
    int y2 = y1 + crop_sz;

    int x1_pad = std::max(0, -x1);
    int y1_pad = std::max(0, -y1);
    int x2_pad = std::max(x2 - src.size[1] + 1, 0);
    int y2_pad = std::max(y2 - src.size[0] + 1, 0);

    Rect roi(x1 + x1_pad, y1 + y1_pad, x2 - x2_pad - x1 - x1_pad, y2 - y2_pad - y1 - y1_pad);
    Mat im_crop = src(roi);
    copyMakeBorder(im_crop, dst, y1_pad, y2_pad, x1_pad, x2_pad, BORDER_CONSTANT);
}
```
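For intuition: with a 50×50 box at the image's top-left corner (x = y = 0) and factor 2, `crop_sz = ceil(sqrt(50 * 50) * 2) = 100`, so a 100×100 square centered on the box is extracted; the 25 columns and 25 rows that fall outside the image on the left and top are filled with zeros by `copyMakeBorder` (`BORDER_CONSTANT`).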
```cpp
void TrackerVitImpl::preprocess(const Mat& src, Mat& dst, Size size)
{
    Mat mean = Mat(size, CV_32FC3, params.meanvalue);
    Mat std = Mat(size, CV_32FC3, params.stdvalue);
    mean = dnn::blobFromImage(mean, 1.0, Size(), Scalar(), false);
    std = dnn::blobFromImage(std, 1.0, Size(), Scalar(), false);

    Mat img;
    resize(src, img, size);

    dst = dnn::blobFromImage(img, 1.0, Size(), Scalar(), false);
    dst /= 255;
    dst = (dst - mean) / std;
}
```
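This is the standard ImageNet mean/std normalization: each channel is scaled to [0, 1], then shifted and divided per channel. For example, a red value of 128 maps to (128/255 − 0.485)/0.229 ≈ 0.074. The mean and std are expanded into full blobs because this `blobFromImage` overload only accepts a single scalar scale factor, so the per-channel std division has to be done manually.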
```cpp
static Mat hann1d(int sz, bool centered = true) {
    Mat hanningWindow(sz, 1, CV_32FC1);
    float* data = hanningWindow.ptr<float>(0);

    if (centered) {
        for (int i = 0; i < sz; i++) {
            float val = 0.5f * (1.0f - std::cos(static_cast<float>(2 * CV_PI / (sz + 1)) * (i + 1)));
            data[i] = val;
        }
    }
    else {
        int half_sz = sz / 2;
        for (int i = 0; i <= half_sz; i++) {
            float val = 0.5f * (1.0f + std::cos(static_cast<float>(2 * CV_PI / (sz + 2)) * i));
            data[i] = val;
            data[sz - 1 - i] = val;
        }
    }

    return hanningWindow;
}

static Mat hann2d(Size size, bool centered = true) {
    int rows = size.height;
    int cols = size.width;

    Mat hanningWindowRows = hann1d(rows, centered);
    Mat hanningWindowCols = hann1d(cols, centered);

    Mat hanningWindow = hanningWindowRows * hanningWindowCols.t();

    return hanningWindow;
}
```
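Note that the `centered = false` variant used by the tracker peaks at both ends of the array and is near zero in the middle, so `1.0 - hanningWindow` (applied to the confidence map in `update()` below) is largest at the center of the 16×16 grid. This suppresses candidate peaks near the border of the search region, i.e. far from the previous target position.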
```cpp
static Rect returnfromcrop(float x, float y, float w, float h, Rect res_Last)
{
    int cropwindowwh = 4 * sqrt(res_Last.width * res_Last.height);
    int x0 = res_Last.x + 0.5 * res_Last.width - 0.5 * cropwindowwh;
    int y0 = res_Last.y + 0.5 * res_Last.height - 0.5 * cropwindowwh;

    Rect finalres;
    finalres.x = x * cropwindowwh + x0;
    finalres.y = y * cropwindowwh + y0;
    finalres.width = w * cropwindowwh;
    finalres.height = h * cropwindowwh;
    return finalres;
}

void TrackerVitImpl::init(InputArray image_, const Rect &boundingBox_)
{
    image = image_.getMat().clone();
    Mat crop;
    crop_image(image, crop, boundingBox_, 2);
    Mat blob;
    preprocess(crop, blob, templateSize);
    net.setInput(blob, "template");
    Size size(16, 16);
    hanningWindow = hann2d(size, false);
    rect_last = boundingBox_;
}

bool TrackerVitImpl::update(InputArray image_, Rect &boundingBoxRes)
{
    image = image_.getMat().clone();
    Mat crop;
    crop_image(image, crop, rect_last, 4);
    Mat blob;
    preprocess(crop, blob, searchSize);
    net.setInput(blob, "search");

    std::vector<String> outputName = {"output1", "output2", "output3"};
    std::vector<Mat> outs;
    net.forward(outs, outputName);
    CV_Assert(outs.size() == 3);

    Mat conf_map = outs[0].reshape(0, {16, 16});
    Mat size_map = outs[1].reshape(0, {2, 16, 16});
    Mat offset_map = outs[2].reshape(0, {2, 16, 16});

    multiply(conf_map, (1.0 - hanningWindow), conf_map);

    double maxVal;
    Point maxLoc;
    minMaxLoc(conf_map, nullptr, &maxVal, nullptr, &maxLoc);
    tracking_score = static_cast<float>(maxVal);

    float cx = (maxLoc.x + offset_map.at<float>(0, maxLoc.y, maxLoc.x)) / 16;
    float cy = (maxLoc.y + offset_map.at<float>(1, maxLoc.y, maxLoc.x)) / 16;
    float w = size_map.at<float>(0, maxLoc.y, maxLoc.x);
    float h = size_map.at<float>(1, maxLoc.y, maxLoc.x);

    Rect finalres = returnfromcrop(cx - w / 2, cy - h / 2, w, h, rect_last);
    rect_last = finalres;
    boundingBoxRes = finalres;
    return true;
}
```
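For reference: the three network outputs are laid out on a 16×16 grid covering the 256×256 search crop (one cell per 16 pixels). Adding the offset map refines the arg-max cell to sub-cell precision, and dividing by 16 normalizes the center into the [0, 1] range of the crop; `returnfromcrop` then scales these normalized coordinates by the crop-window side (4·sqrt(w·h) of the previous box, matching the factor 4 passed to `crop_image`) and translates them back into full-image coordinates.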
```cpp
float TrackerVitImpl::getTrackingScore()
{
    return tracking_score;
}

Ptr<TrackerVit> TrackerVit::create(const TrackerVit::Params& parameters)
{
    return makePtr<TrackerVitImpl>(parameters);
}

#else  // HAVE_OPENCV_DNN

Ptr<TrackerVit> TrackerVit::create(const TrackerVit::Params& parameters)
{
    CV_UNUSED(parameters);
    CV_Error(Error::StsNotImplemented, "to use vittrack, the tracking module needs to be built with opencv_dnn!");
}

#endif  // HAVE_OPENCV_DNN

}  // namespace cv
```

@@ -160,4 +160,13 @@ TEST(NanoTrack, accuracy_NanoTrack_V2)

```cpp
    checkTrackingAccuracy(tracker, 0.69);
}

TEST(vittrack, accuracy_vittrack)
{
    std::string model = cvtest::findDataFile("dnn/onnx/models/vitTracker.onnx", false);
    cv::TrackerVit::Params params;
    params.net = model;
    cv::Ptr<Tracker> tracker = TrackerVit::create(params);
    checkTrackingAccuracy(tracker, 0.67);
}

}}  // namespace opencv_test::
```

@@ -0,0 +1,176 @@

```cpp
// VitTracker
// model: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack

#include <cctype>
#include <iostream>
#include <cmath>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>

using namespace cv;
using namespace cv::dnn;

const char *keys =
        "{ help h   |   | Print help message }"
        "{ input i  |   | Full path to input video file, or a single digit for a camera index (empty for camera 0) }"
        "{ net      | vitTracker.onnx | Path to ONNX model of vitTracker }"
        "{ backend  | 0 | Choose one of computation backends: "
                        "0: automatically (by default), "
                        "1: Halide language (http://halide-lang.org/), "
                        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                        "3: OpenCV implementation, "
                        "4: VKCOM, "
                        "5: CUDA }"
        "{ target   | 0 | Choose one of target computation devices: "
                        "0: CPU target (by default), "
                        "1: OpenCL, "
                        "2: OpenCL fp16 (half-float precision), "
                        "3: VPU, "
                        "4: Vulkan, "
                        "6: CUDA, "
                        "7: CUDA fp16 (half-float preprocess) }"
;

static
int run(int argc, char** argv)
{
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);

    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    std::string inputName = parser.get<String>("input");
    std::string net = parser.get<String>("net");
    int backend = parser.get<int>("backend");
    int target = parser.get<int>("target");

    Ptr<TrackerVit> tracker;
    try
    {
        TrackerVit::Params params;
        params.net = samples::findFile(net);
        params.backend = backend;
        params.target = target;
        tracker = TrackerVit::create(params);
    }
    catch (const cv::Exception& ee)
    {
        std::cerr << "Exception: " << ee.what() << std::endl;
        std::cout << "Can't load the network using the following file:" << std::endl;
        std::cout << "net : " << net << std::endl;
        return 2;
    }

    const std::string winName = "vitTracker";
    namedWindow(winName, WINDOW_AUTOSIZE);

    // Open a video file or a camera stream.
    VideoCapture cap;

    if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
    {
        int c = inputName.empty() ? 0 : inputName[0] - '0';
        std::cout << "Trying to open camera #" << c << " ..." << std::endl;
        if (!cap.open(c))
        {
            std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from a video file" << std::endl;
            return 2;
        }
    }
    else
    {
        inputName = samples::findFileOrKeep(inputName);
        if (!cap.open(inputName))
        {
            std::cout << "Could not open: " << inputName << std::endl;
            return 2;
        }
    }

    // Read the first image.
    Mat image;
    cap >> image;
    if (image.empty())
    {
        std::cerr << "Can't capture frame!" << std::endl;
        return 2;
    }

    Mat image_select = image.clone();
    putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
    putText(image_select, "And press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

    Rect selectRect = selectROI(winName, image_select);
    std::cout << "ROI=" << selectRect << std::endl;

    tracker->init(image, selectRect);

    TickMeter tickMeter;

    for (int count = 0; ; ++count)
    {
        cap >> image;
        if (image.empty())
        {
            std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
            break;
        }

        Rect rect;

        tickMeter.start();
        bool ok = tracker->update(image, rect);
        tickMeter.stop();

        float score = tracker->getTrackingScore();

        std::cout << "frame " << count <<
            ": predicted score=" << score <<
            "  rect=" << rect <<
            "  time=" << tickMeter.getTimeMilli() << "ms" <<
            std::endl;

        Mat render_image = image.clone();

        if (ok)
        {
            rectangle(render_image, rect, Scalar(0, 255, 0), 2);

            std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
            std::string scoreLabel = format("Score: %f", score);
            putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
            putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        }

        imshow(winName, render_image);

        tickMeter.reset();

        int c = waitKey(1);
        if (c == 27 /*ESC*/)
            break;
    }

    std::cout << "Exit" << std::endl;
    return 0;
}

int main(int argc, char **argv)
{
    try
    {
        return run(argc, argv);
    }
    catch (const std::exception& e)
    {
        std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
        return 1;
    }
}
```
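Once built with the project CMake, the sample can be run with, e.g., `example_dnn_vit_tracker --net=vitTracker.onnx --input=test.avi` (the binary name follows OpenCV's usual sample naming and the video path is illustrative; both flags appear in the `keys` string above).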

@@ -22,6 +22,7 @@ USAGE:

```python
                             [--dasiamrpn_backend DASIAMRPN_BACKEND]
                             [--dasiamrpn_target DASIAMRPN_TARGET]
                             [--nanotrack_backbone NANOTRACK_BACKEND] [--nanotrack_headneck NANOTRACK_TARGET]
                             [--vittrack_net VITTRACK_MODEL]
'''

# Python 2/3 compatibility
```

@@ -61,6 +62,10 @@ class App(object):

```python
            params.backbone = args.nanotrack_backbone
            params.neckhead = args.nanotrack_headneck
            tracker = cv.TrackerNano_create(params)
        elif self.trackerAlgorithm == 'vittrack':
            params = cv.TrackerVit_Params()
            params.net = args.vittrack_net
            tracker = cv.TrackerVit_create(params)
        else:
            sys.exit("Tracker {} is not recognized. Please use one of the available trackers: mil, goturn, dasiamrpn, nanotrack, vittrack.".format(self.trackerAlgorithm))
        return tracker
```

@@ -126,7 +131,7 @@ if __name__ == '__main__':

```python
    print(__doc__)
    parser = argparse.ArgumentParser(description="Run tracker")
    parser.add_argument("--input", type=str, default="vtest.avi", help="Path to video source")
    parser.add_argument("--tracker_algo", type=str, default="nanotrack", help="One of available tracking algorithms: mil, goturn, dasiamrpn, nanotrack, vittrack")
    parser.add_argument("--goturn", type=str, default="goturn.prototxt", help="Path to GOTURN architecture")
    parser.add_argument("--goturn_model", type=str, default="goturn.caffemodel", help="Path to GOTURN model")
    parser.add_argument("--dasiamrpn_net", type=str, default="dasiamrpn_model.onnx", help="Path to onnx model of DaSiamRPN net")
```

@@ -134,6 +139,7 @@ if __name__ == '__main__':

```python
    parser.add_argument("--dasiamrpn_kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Path to onnx model of DaSiamRPN kernel_cls1")
    parser.add_argument("--nanotrack_backbone", type=str, default="nanotrack_backbone_sim.onnx", help="Path to onnx model of NanoTrack backBone")
    parser.add_argument("--nanotrack_headneck", type=str, default="nanotrack_head_sim.onnx", help="Path to onnx model of NanoTrack headNeck")
    parser.add_argument("--vittrack_net", type=str, default="vitTracker.onnx", help="Path to onnx model of vittrack")

    args = parser.parse_args()
    App(args).run()
```
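With these changes, the Python sample can be invoked as, e.g., `python tracker.py --tracker_algo vittrack --vittrack_net vitTracker.onnx --input vtest.avi` (all three flags appear in the patch above; `vtest.avi` is the sample's default input).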
