Merge pull request #25771 from fengyuentau:vittrack_black_input

video: fix vittrack in the case where crop size grows until out-of-memory when the input is black #25771


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests, and test data in the opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng 10 months ago committed by GitHub
parent 0fac5d52bc
commit e3884a9ea8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 1
  2. 69
  3. 4
  4. 35

@ -920,6 +920,7 @@ public:
CV_PROP_RW int target;
CV_PROP_RW Scalar meanvalue;
CV_PROP_RW Scalar stdvalue;
CV_PROP_RW float tracking_score_threshold;
/** @brief Constructor

@ -24,8 +24,8 @@ TrackerVit::~TrackerVit()
net = "vitTracker.onnx";
meanvalue = Scalar{0.485, 0.456, 0.406};
stdvalue = Scalar{0.229, 0.224, 0.225};
meanvalue = Scalar{0.485, 0.456, 0.406}; // normalized mean (already divided by 255)
stdvalue = Scalar{0.229, 0.224, 0.225}; // normalized std (already divided by 255)
backend = dnn::DNN_BACKEND_DEFAULT;
target = dnn::DNN_TARGET_CPU;
@ -33,6 +33,7 @@ TrackerVit::Params::Params()
backend = -1; // invalid value
target = -1; // invalid value
tracking_score_threshold = 0.20f; // safe threshold to filter out black frames
@ -48,6 +49,9 @@ public:
i2bp.mean = params.meanvalue * 255.0;
i2bp.scalefactor = (1.0 / params.stdvalue) * (1 / 255.0);
void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
@ -58,6 +62,7 @@ public:
float tracking_score;
TrackerVit::Params params;
dnn::Image2BlobParams i2bp;
@ -69,10 +74,9 @@ protected:
Mat hanningWindow;
dnn::Net net;
Mat image;
static void crop_image(const Mat& src, Mat& dst, Rect box, int factor)
static int crop_image(const Mat& src, Mat& dst, Rect box, int factor)
int x = box.x, y = box.y, w = box.width, h = box.height;
int crop_sz = cvCeil(sqrt(w * h) * factor);
@ -90,21 +94,16 @@ static void crop_image(const Mat& src, Mat& dst, Rect box, int factor)
Rect roi(x1 + x1_pad, y1 + y1_pad, x2 - x2_pad - x1 - x1_pad, y2 - y2_pad - y1 - y1_pad);
Mat im_crop = src(roi);
copyMakeBorder(im_crop, dst, y1_pad, y2_pad, x1_pad, x2_pad, BORDER_CONSTANT);
return crop_sz;
void TrackerVitImpl::preprocess(const Mat& src, Mat& dst, Size size)
Mat mean = Mat(size, CV_32FC3, params.meanvalue);
Mat std = Mat(size, CV_32FC3, params.stdvalue);
mean = dnn::blobFromImage(mean, 1.0, Size(), Scalar(), false);
std = dnn::blobFromImage(std, 1.0, Size(), Scalar(), false);
Mat img;
resize(src, img, size);
dst = dnn::blobFromImage(img, 1.0, Size(), Scalar(), false);
dst /= 255;
dst = (dst - mean) / std;
dst = dnn::blobFromImageWithParams(img, i2bp);
static Mat hann1d(int sz, bool centered = true) {
@ -141,22 +140,21 @@ static Mat hann2d(Size size, bool centered = true) {
return hanningWindow;
static Rect returnfromcrop(float x, float y, float w, float h, Rect res_Last)
static void updateLastRect(float cx, float cy, float w, float h, int crop_size, Rect &rect_last)
int cropwindowwh = 4 * cvFloor(sqrt(res_Last.width * res_Last.height));
int x0 = res_Last.x + (res_Last.width - cropwindowwh) / 2;
int y0 = res_Last.y + (res_Last.height - cropwindowwh) / 2;
Rect finalres;
finalres.x = cvFloor(x * cropwindowwh + x0);
finalres.y = cvFloor(y * cropwindowwh + y0);
finalres.width = cvFloor(w * cropwindowwh);
finalres.height = cvFloor(h * cropwindowwh);
return finalres;
int x0 = rect_last.x + (rect_last.width - crop_size) / 2;
int y0 = rect_last.y + (rect_last.height - crop_size) / 2;
float x1 = cx - w / 2, y1 = cy - h / 2;
rect_last.x = cvFloor(x1 * crop_size + x0);
rect_last.y = cvFloor(y1 * crop_size + y0);
rect_last.width = cvFloor(w * crop_size);
rect_last.height = cvFloor(h * crop_size);
void TrackerVitImpl::init(InputArray image_, const Rect &boundingBox_)
image = image_.getMat().clone();
Mat image = image_.getMat();
Mat crop;
crop_image(image, crop, boundingBox_, 2);
Mat blob;
@ -169,9 +167,9 @@ void TrackerVitImpl::init(InputArray image_, const Rect &boundingBox_)
bool TrackerVitImpl::update(InputArray image_, Rect &boundingBoxRes)
image = image_.getMat().clone();
Mat image = image_.getMat();
Mat crop;
crop_image(image, crop, rect_last, 4);
int crop_size = crop_image(image, crop, rect_last, 4); // crop: [crop_size, crop_size]
Mat blob;
preprocess(crop, blob, searchSize);
net.setInput(blob, "search");
@ -191,15 +189,18 @@ bool TrackerVitImpl::update(InputArray image_, Rect &boundingBoxRes)
minMaxLoc(conf_map, nullptr, &maxVal, nullptr, &maxLoc);
tracking_score = static_cast<float>(maxVal);
float cx = (maxLoc.x +<float>(0, maxLoc.y, maxLoc.x)) / 16;
float cy = (maxLoc.y +<float>(1, maxLoc.y, maxLoc.x)) / 16;
float w =<float>(0, maxLoc.y, maxLoc.x);
float h =<float>(1, maxLoc.y, maxLoc.x);
Rect finalres = returnfromcrop(cx - w / 2, cy - h / 2, w, h, rect_last);
rect_last = finalres;
boundingBoxRes = finalres;
return true;
if (tracking_score >= params.tracking_score_threshold) {
float cx = (maxLoc.x +<float>(0, maxLoc.y, maxLoc.x)) / 16;
float cy = (maxLoc.y +<float>(1, maxLoc.y, maxLoc.x)) / 16;
float w =<float>(0, maxLoc.y, maxLoc.x);
float h =<float>(1, maxLoc.y, maxLoc.x);
updateLastRect(cx, cy, w, h, crop_size, rect_last);
boundingBoxRes = rect_last;
return true;
} else {
return false;
float TrackerVitImpl::getTrackingScore()

@ -166,9 +166,7 @@ TEST(vittrack, accuracy_vittrack)
cv::TrackerVit::Params params; = model;
cv::Ptr<Tracker> tracker = TrackerVit::create(params);
// NOTE: Test threshold was reduced from 0.67 (libjpeg-turbo) to 0.66 (libjpeg 9f),
// because libjpeg and libjpeg-turbo produce slightly different images
checkTrackingAccuracy(tracker, 0.66);
checkTrackingAccuracy(tracker, 0.64);
}} // namespace opencv_test::

@ -16,6 +16,7 @@ const char *keys =
"{ help h | | Print help message }"
"{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
"{ net | vitTracker.onnx | Path to onnx model of vitTracker.onnx}"
"{ tracking_score_threshold t | 0.3 | Tracking score threshold. If a bbox of score >= 0.3, it is considered as found }"
"{ backend | 0 | Choose one of computation backends: "
"0: automatically (by default), "
"1: Halide language (, "
@ -49,6 +50,7 @@ int run(int argc, char** argv)
std::string net = parser.get<String>("net");
int backend = parser.get<int>("backend");
int target = parser.get<int>("target");
float tracking_score_threshold = parser.get<float>("tracking_score_threshold");
Ptr<TrackerVit> tracker;
@ -57,6 +59,7 @@ int run(int argc, char** argv) = samples::findFile(net);
params.backend = backend; = target;
params.tracking_score_threshold = tracking_score_threshold;
tracker = TrackerVit::create(params);
catch (const cv::Exception& ee)
@ -108,6 +111,11 @@ int run(int argc, char** argv)
Rect selectRect = selectROI(winName, image_select);
std::cout << "ROI=" << selectRect << std::endl;
if (selectRect.empty())
std::cerr << "Invalid ROI!" << std::endl;
return 2;
tracker->init(image, selectRect);
@ -130,30 +138,29 @@ int run(int argc, char** argv)
float score = tracker->getTrackingScore();
std::cout << "frame " << count <<
": predicted score=" << score <<
" rect=" << rect <<
" time=" << tickMeter.getTimeMilli() << "ms" <<
std::cout << "frame " << count;
if (ok) {
std::cout << ": predicted score=" << score <<
"\trect=" << rect <<
"\ttime=" << tickMeter.getTimeMilli() << "ms" << std::endl;
Mat render_image = image.clone();
if (ok)
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
rectangle(image, rect, Scalar(0, 255, 0), 2);
std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
std::string scoreLabel = format("Score: %f", score);
putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
} else {
std::cout << ": target lost" << std::endl;
putText(image, "Target lost", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 255));
imshow(winName, render_image);
imshow(winName, image);
int c = waitKey(1);
if (c == 27 /*ESC*/)
if (c == 27 /*ESC*/ || c == 'q' || c == 'Q')
