parent
667f5b655a
commit
e8d94ea87c
14 changed files with 555 additions and 995 deletions
@@ -0,0 +1,20 @@
# OpenCV deep learning module samples

## Model Zoo

### Object detection

| Model | Scale | Size WxH | Mean subtraction | Channels order |
|---------------|-------|-----------|--------------------|-------|
| [MobileNet-SSD, Caffe](https://github.com/chuanqi305/MobileNet-SSD/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | BGR |
| [OpenCV face detector](https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector) | `1.0` | `300x300` | `104 177 123` | BGR |
| [SSDs from TensorFlow](https://github.com/tensorflow/models/tree/master/research/object_detection/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | RGB |
| [YOLO](https://pjreddie.com/darknet/yolo/) | `0.00392 (1/255)` | `416x416` | `0 0 0` | RGB |
| [VGG16-SSD](https://github.com/weiliu89/caffe/tree/ssd) | `1.0` | `300x300` | `104 117 123` | BGR |
| [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn) | `1.0` | `800x600` | `102.9801 115.9465 122.7717` | BGR |
| [R-FCN](https://github.com/YuwenXiong/py-R-FCN) | `1.0` | `800x600` | `102.9801 115.9465 122.7717` | BGR |
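
Each row of the table maps directly onto the parameters of `cv.dnn.blobFromImage`. A minimal Python sketch for the MobileNet-SSD row, assuming the model files have been downloaded separately (file paths are placeholders):

```python
import cv2 as cv

# Preprocessing values from the MobileNet-SSD row above:
# scale 0.00784 (2/255), size 300x300, mean 127.5 per channel, BGR order.
net = cv.dnn.readNetFromCaffe('MobileNetSSD_deploy.prototxt', 'MobileNetSSD_deploy.caffemodel')
img = cv.imread('example.jpg')
blob = cv.dnn.blobFromImage(img, scalefactor=0.00784, size=(300, 300),
                            mean=(127.5, 127.5, 127.5), swapRB=False)
net.setInput(blob)
out = net.forward()  # 1x1xNx7 blob: [batchId, classId, confidence, left, top, right, bottom]
```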

## References
* [Models downloading script](https://github.com/opencv/opencv_extra/blob/master/testdata/dnn/download_models.py)
* [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn)
* [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
@@ -1,93 +0,0 @@
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;
using namespace dnn;

const char* keys =
    "{ help h | | print help message }"
    "{ proto p | | path to .prototxt }"
    "{ model m | | path to .caffemodel }"
    "{ image i | | path to input image }"
    "{ conf c | 0.8 | minimal confidence }";

const char* classNames[] = {
    "__background__",
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat", "chair",
    "cow", "diningtable", "dog", "horse",
    "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor"
};

static const int kInpWidth = 800;
static const int kInpHeight = 600;

int main(int argc, char** argv)
{
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);
    parser.about("This sample is used to run Faster-RCNN and R-FCN object detection "
                 "models with OpenCV. You can get required models from "
                 "https://github.com/rbgirshick/py-faster-rcnn (Faster-RCNN) and from "
                 "https://github.com/YuwenXiong/py-R-FCN (R-FCN). Corresponding .prototxt "
                 "files may be found at https://github.com/opencv/opencv_extra/tree/master/testdata/dnn.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    String protoPath = parser.get<String>("proto");
    String modelPath = parser.get<String>("model");
    String imagePath = parser.get<String>("image");
    float confThreshold = parser.get<float>("conf");
    CV_Assert(!protoPath.empty() && !modelPath.empty() && !imagePath.empty());

    // Load a model.
    Net net = readNetFromCaffe(protoPath, modelPath);

    Mat img = imread(imagePath);
    resize(img, img, Size(kInpWidth, kInpHeight));
    Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
    Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
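    // im_info is [input height, input width, image scale]; the network's proposal
    // layer reads it when generating region proposals. The 1.6 scale value follows
    // the original sample and appears tied to the fixed 800x600 resize above.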

    net.setInput(blob, "data");
    net.setInput(imInfo, "im_info");

    // Draw detections.
    Mat detections = net.forward();
    const float* data = (float*)detections.data;
    for (size_t i = 0; i < detections.total(); i += 7)
    {
        // Every detection is a vector [batchId, classId, confidence, left, top, right, bottom].
        float confidence = data[i + 2];
        if (confidence > confThreshold)
        {
            int classId = (int)data[i + 1];
            int left = max(0, min((int)data[i + 3], img.cols - 1));
            int top = max(0, min((int)data[i + 4], img.rows - 1));
            int right = max(0, min((int)data[i + 5], img.cols - 1));
            int bottom = max(0, min((int)data[i + 6], img.rows - 1));

            // Draw a bounding box.
            rectangle(img, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));

            // Put a label with a class name and confidence.
            String label = cv::format("%s, %.3f", classNames[classId], confidence);
            int baseLine;
            Size labelSize = cv::getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

            top = max(top, labelSize.height);
            rectangle(img, Point(left, top - labelSize.height),
                      Point(left + labelSize.width, top + baseLine),
                      Scalar(255, 255, 255), FILLED);
            putText(img, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
        }
    }
    imshow("frame", img);
    waitKey();
    return 0;
}
@@ -1,132 +0,0 @@
# This script demonstrates the MobileNet-SSD network using the OpenCV deep learning module.
#
# It works with a model taken from https://github.com/chuanqi305/MobileNet-SSD/ that
# was trained in the Caffe-SSD framework, https://github.com/weiliu89/caffe/tree/ssd.
# The model detects objects from 20 classes.
#
# A TensorFlow model from the TensorFlow object detection model zoo may also be used
# to detect objects from 90 classes:
# http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz
# The text graph definition must be taken from opencv_extra:
# https://github.com/opencv/opencv_extra/tree/master/testdata/dnn/ssd_mobilenet_v1_coco.pbtxt
import numpy as np
import argparse

try:
    import cv2 as cv
except ImportError:
    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')

inWidth = 300
inHeight = 300
WHRatio = inWidth / float(inHeight)
inScaleFactor = 0.007843
meanVal = 127.5

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Script to run MobileNet-SSD object detection network '
                    'trained either in Caffe or TensorFlow frameworks.')
    parser.add_argument("--video", help="path to video file. If empty, camera's stream will be used")
    parser.add_argument("--prototxt", default="MobileNetSSD_deploy.prototxt",
                        help='Path to text network file: '
                             'MobileNetSSD_deploy.prototxt for Caffe model or '
                             'ssd_mobilenet_v1_coco.pbtxt from opencv_extra for TensorFlow model')
    parser.add_argument("--weights", default="MobileNetSSD_deploy.caffemodel",
                        help='Path to weights: '
                             'MobileNetSSD_deploy.caffemodel for Caffe model or '
                             'frozen_inference_graph.pb from TensorFlow.')
    parser.add_argument("--num_classes", default=20, type=int,
                        help="Number of classes. It's 20 for Caffe model from "
                             "https://github.com/chuanqi305/MobileNet-SSD/ and 90 for "
                             "TensorFlow model from http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz")
    parser.add_argument("--thr", default=0.2, type=float, help="confidence threshold to filter out weak detections")
    args = parser.parse_args()

    if args.num_classes == 20:
        net = cv.dnn.readNetFromCaffe(args.prototxt, args.weights)
        swapRB = False
        classNames = { 0: 'background',
                       1: 'aeroplane', 2: 'bicycle', 3: 'bird', 4: 'boat',
                       5: 'bottle', 6: 'bus', 7: 'car', 8: 'cat', 9: 'chair',
                       10: 'cow', 11: 'diningtable', 12: 'dog', 13: 'horse',
                       14: 'motorbike', 15: 'person', 16: 'pottedplant',
                       17: 'sheep', 18: 'sofa', 19: 'train', 20: 'tvmonitor' }
    else:
        assert(args.num_classes == 90)
        net = cv.dnn.readNetFromTensorflow(args.weights, args.prototxt)
        swapRB = True
        classNames = { 0: 'background',
                       1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus',
                       7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant',
                       13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat',
                       18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear',
                       24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag',
                       32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard',
                       37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',
                       41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle',
                       46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon',
                       51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',
                       56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut',
                       61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed',
                       67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',
                       75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven',
                       80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock',
                       86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' }

    if args.video:
        cap = cv.VideoCapture(args.video)
    else:
        cap = cv.VideoCapture(0)

    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break
        blob = cv.dnn.blobFromImage(frame, inScaleFactor, (inWidth, inHeight), (meanVal, meanVal, meanVal), swapRB)
        net.setInput(blob)
        detections = net.forward()

        cols = frame.shape[1]
        rows = frame.shape[0]

        if cols / float(rows) > WHRatio:
            cropSize = (int(rows * WHRatio), rows)
        else:
            cropSize = (cols, int(cols / WHRatio))
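        # Center-crop the frame to the network's aspect ratio (square here) so that
        # the normalized detection coordinates map onto the displayed image without
        # distortion.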
        y1 = int((rows - cropSize[1]) / 2)
        y2 = y1 + cropSize[1]
        x1 = int((cols - cropSize[0]) / 2)
        x2 = x1 + cropSize[0]
        frame = frame[y1:y2, x1:x2]

        cols = frame.shape[1]
        rows = frame.shape[0]

        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence > args.thr:
                class_id = int(detections[0, 0, i, 1])

                xLeftBottom = int(detections[0, 0, i, 3] * cols)
                yLeftBottom = int(detections[0, 0, i, 4] * rows)
                xRightTop = int(detections[0, 0, i, 5] * cols)
                yRightTop = int(detections[0, 0, i, 6] * rows)

                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                             (0, 255, 0))
                if class_id in classNames:
                    label = classNames[class_id] + ": " + str(confidence)
                    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)

                    yLeftBottom = max(yLeftBottom, labelSize[1])
                    cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
                                 (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
                                 (255, 255, 255), cv.FILLED)
                    cv.putText(frame, label, (xLeftBottom, yLeftBottom),
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        cv.imshow("detections", frame)
        if cv.waitKey(1) >= 0:
            break
@@ -0,0 +1,255 @@
#include <opencv2/opencv.hpp>
#include <fstream>
#include <iostream>
#include <sstream>

const char* keys =
    "{ help h | | Print help message. }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
    "{ model m | | Path to a binary file of the model that contains trained weights. "
                  "It could be a file with extensions .caffemodel (Caffe), "
                  ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
    "{ config c | | Path to a text file of the model that contains network configuration. "
                   "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
    "{ framework f | | Optional name of an origin framework of the model. It is detected automatically if not set. }"
    "{ classes | | Optional path to a text file with names of classes to label detected objects. }"
    "{ mean | | Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces. }"
    "{ scale | 1 | Preprocess input image by multiplying by a scale factor. }"
    "{ width | -1 | Preprocess input image by resizing to a specific width. }"
    "{ height | -1 | Preprocess input image by resizing to a specific height. }"
    "{ rgb | | Indicate that model works with RGB input images instead of BGR ones. }"
    "{ thr | .5 | Confidence threshold. }"
    "{ opencl | | Enable OpenCL }";
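// A hypothetical invocation for a Darknet YOLOv2 model, with preprocessing values
// taken from the Model Zoo table in the README (binary name and file paths are
// placeholders):
//   ./object_detection --model=yolov2.weights --config=yolov2.cfg \
//       --classes=coco.names --width=416 --height=416 --scale=0.00392 --rgb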

using namespace cv;
using namespace dnn;

float confThreshold;
std::vector<std::string> classes;

Net readNet(const std::string& model, const std::string& config = "", const std::string& framework = "");

void postprocess(Mat& frame, const Mat& out, Net& net);

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

void callback(int pos, void* userdata);

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);
    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    confThreshold = parser.get<float>("thr");
    float scale = parser.get<float>("scale");
    bool swapRB = parser.get<bool>("rgb");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");

    // Parse mean values.
    Scalar mean;
    if (parser.has("mean"))
    {
        std::istringstream meanStr(parser.get<String>("mean"));
        std::vector<float> meanValues;
        float val;
        while (meanStr >> val)
            meanValues.push_back(val);
        CV_Assert(meanValues.size() == 3);
        mean = Scalar(meanValues[0], meanValues[1], meanValues[2]);
    }

    // Open file with classes names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))  // read whole lines so multi-word names survive
        {
            classes.push_back(line);
        }
    }

    // Load a model.
    CV_Assert(parser.has("model"));
    Net net = readNet(parser.get<String>("model"), parser.get<String>("config"), parser.get<String>("framework"));

    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }

    // Create a window
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);
    int initialConf = (int)(confThreshold * 100);
    createTrackbar("Confidence threshold", kWinName, &initialConf, 99, callback);

    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);

    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }

        // Create a 4D blob from a frame.
        Size inpSize(inpWidth > 0 ? inpWidth : frame.cols,
                     inpHeight > 0 ? inpHeight : frame.rows);
        blobFromImage(frame, blob, scale, inpSize, mean, swapRB, false);

        // Run a model.
        net.setInput(blob);
        if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
        {
            resize(frame, frame, inpSize);
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
        Mat out = net.forward();

        postprocess(frame, out, net);

        // Put efficiency information.
        std::vector<double> layersTimes;
        double t = net.getPerfProfile(layersTimes);
        std::string label = format("Inference time: %.2f ms", t * 1000 / getTickFrequency());
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());

        imshow(kWinName, frame);
    }
    return 0;
}

void postprocess(Mat& frame, const Mat& out, Net& net)
{
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

    float* data = (float*)out.data;
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        for (size_t i = 0; i < out.total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
            {
                int left = (int)data[i + 3];
                int top = (int)data[i + 4];
                int right = (int)data[i + 5];
                int bottom = (int)data[i + 6];
                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
                drawPred(classId, confidence, left, top, right, bottom, frame);
            }
        }
    }
    else if (outLayerType == "DetectionOutput")
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        for (size_t i = 0; i < out.total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
            {
                int left = (int)(data[i + 3] * frame.cols);
                int top = (int)(data[i + 4] * frame.rows);
                int right = (int)(data[i + 5] * frame.cols);
                int bottom = (int)(data[i + 6] * frame.rows);
                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
                drawPred(classId, confidence, left, top, right, bottom, frame);
            }
        }
    }
    else if (outLayerType == "Region")
    {
        // Network produces output blob with a shape NxC where N is a number of
        // detected objects and C is a number of classes + 4 where the first 4
        // numbers are [center_x, center_y, width, height]
        for (int i = 0; i < out.rows; ++i, data += out.cols)
        {
            Mat confidences = out.row(i).colRange(5, out.cols);
            Point classIdPoint;
            double confidence;
            minMaxLoc(confidences, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold)
            {
                int classId = classIdPoint.x;
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                drawPred(classId, (float)confidence, left, top, left + width, top + height, frame);
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
}

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
{
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));

    std::string label = format("%.2f", conf);
    if (!classes.empty())
    {
        CV_Assert(classId < (int)classes.size());
        label = classes[classId] + ": " + label;
    }

    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height),
              Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
}

void callback(int pos, void*)
{
    confThreshold = pos * 0.01f;
}

Net readNet(const std::string& model, const std::string& config, const std::string& framework)
{
    std::string modelExt = model.substr(model.rfind('.'));  // rfind: take the last extension
    if (framework == "caffe" || modelExt == ".caffemodel")
        return readNetFromCaffe(config, model);
    else if (framework == "tensorflow" || modelExt == ".pb")
        return readNetFromTensorflow(model, config);
    else if (framework == "torch" || modelExt == ".t7" || modelExt == ".net")
        return readNetFromTorch(model);
    else if (framework == "darknet" || modelExt == ".weights")
        return readNetFromDarknet(config, model);
    else
        CV_Error(Error::StsError, "Cannot determine an origin framework of model from file " + model);
    return Net();
}
@@ -0,0 +1,161 @@
import cv2 as cv
import argparse
import sys
import numpy as np

parser = argparse.ArgumentParser(description='Use this script to run object detection deep learning networks using OpenCV.')
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True,
                    help='Path to a binary file of the model that contains trained weights. '
                         'It could be a file with extensions .caffemodel (Caffe), '
                         '.pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet)')
parser.add_argument('--config',
                    help='Path to a text file of the model that contains network configuration. '
                         'It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet)')
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
                    help='Optional name of an origin framework of the model. '
                         'It is detected automatically if not set.')
parser.add_argument('--classes', help='Optional path to a text file with names of classes to label detected objects.')
parser.add_argument('--mean', nargs='+', type=float, default=[0, 0, 0],
                    help='Preprocess input image by subtracting mean values. '
                         'Mean values should be in BGR order.')
parser.add_argument('--scale', type=float, default=1.0,
                    help='Preprocess input image by multiplying by a scale factor.')
parser.add_argument('--width', type=int,
                    help='Preprocess input image by resizing to a specific width.')
parser.add_argument('--height', type=int,
                    help='Preprocess input image by resizing to a specific height.')
parser.add_argument('--rgb', action='store_true',
                    help='Indicate that model works with RGB input images instead of BGR ones.')
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
args = parser.parse_args()
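# A hypothetical invocation for the Caffe MobileNet-SSD model, with preprocessing
# values taken from the Model Zoo table in the README (file paths are placeholders):
#   python object_detection.py --model=MobileNetSSD_deploy.caffemodel \
#       --config=MobileNetSSD_deploy.prototxt --width=300 --height=300 \
#       --scale=0.00784 --mean 127.5 127.5 127.5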

# Load names of classes
classes = None
if args.classes:
    with open(args.classes, 'rt') as f:
        classes = f.read().rstrip('\n').split('\n')

# Load a network
modelExt = args.model[args.model.rfind('.'):]  # rfind: take the last extension
if args.framework == 'caffe' or modelExt == '.caffemodel':
    net = cv.dnn.readNetFromCaffe(args.config, args.model)
elif args.framework == 'tensorflow' or modelExt == '.pb':
    net = cv.dnn.readNetFromTensorflow(args.model, args.config)
elif args.framework == 'torch' or modelExt in ['.t7', '.net']:
    net = cv.dnn.readNetFromTorch(args.model)
elif args.framework == 'darknet' or modelExt == '.weights':
    net = cv.dnn.readNetFromDarknet(args.config, args.model)
else:
    print('Cannot determine an origin framework of model from file %s' % args.model)
    sys.exit(0)

confThreshold = args.thr

def postprocess(frame, out):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    def drawPred(classId, conf, left, top, right, bottom):
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

        label = '%.2f' % conf

        # Print a label of class.
        if classes:
            assert(classId < len(classes))
            label = '%s: %s' % (classes[classId], label)

        labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        top = max(top, labelSize[1])
        cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
        cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

    layerNames = net.getLayerNames()
    lastLayerId = net.getLayerId(layerNames[-1])
    lastLayer = net.getLayer(lastLayerId)

    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
                left = int(detection[3])
                top = int(detection[4])
                right = int(detection[5])
                bottom = int(detection[6])
                classId = int(detection[1]) - 1  # Skip background label
                drawPred(classId, confidence, left, top, right, bottom)
    elif lastLayer.type == 'DetectionOutput':
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
                left = int(detection[3] * frameWidth)
                top = int(detection[4] * frameHeight)
                right = int(detection[5] * frameWidth)
                bottom = int(detection[6] * frameHeight)
                classId = int(detection[1]) - 1  # Skip background label
                drawPred(classId, confidence, left, top, right, bottom)
    elif lastLayer.type == 'Region':
        # Network produces output blob with a shape NxC where N is a number of
        # detected objects and C is a number of classes + 4 where the first 4
        # numbers are [center_x, center_y, width, height]
        for detection in out:
            confidences = detection[5:]
            classId = np.argmax(confidences)
            confidence = confidences[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                drawPred(classId, confidence, left, top, left + width, top + height)

# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

def callback(pos):
    global confThreshold
    confThreshold = pos / 100.0

cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)

cap = cv.VideoCapture(args.input if args.input else 0)
while cv.waitKey(1) < 0:
    hasFrame, frame = cap.read()
    if not hasFrame:
        cv.waitKey()
        break

    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    # Create a 4D blob from a frame.
    inpWidth = args.width if args.width else frameWidth
    inpHeight = args.height if args.height else frameHeight
    blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)

    # Run a model
    net.setInput(blob)
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        frame = cv.resize(frame, (inpWidth, inpHeight))
        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info')
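        # im_info is [input height, input width, image scale]; the proposal layer
        # reads it to generate and clip region proposals for the resized frame.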
    out = net.forward()

    postprocess(frame, out)

    # Put efficiency information.
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

    cv.imshow(winName, frame)
@@ -0,0 +1,90 @@
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant

stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe

backpack
umbrella


handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle

wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed

dining table


toilet

tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator

book
clock
vase
scissors
teddy bear
hair drier
toothbrush
@@ -0,0 +1,20 @@
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
@@ -1,164 +0,0 @@
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>

using namespace cv;
using namespace std;
using namespace cv::dnn;

const size_t inWidth = 300;
const size_t inHeight = 300;
const double inScaleFactor = 1.0;
const Scalar meanVal(104.0, 177.0, 123.0);

const char* about = "This sample uses Single-Shot Detector "
                    "(https://arxiv.org/abs/1512.02325) "
                    "with ResNet-10 architecture to detect faces on camera/video/image.\n"
                    "More information about the training is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/how_to_train_face_detector.txt\n"
                    ".caffemodel model's file is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/res10_300x300_ssd_iter_140000.caffemodel\n"
                    ".prototxt file is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/deploy.prototxt\n";

const char* params
    = "{ help | false | print usage }"
      "{ proto | | model configuration (deploy.prototxt) }"
      "{ model | | model weights (res10_300x300_ssd_iter_140000.caffemodel) }"
      "{ camera_device | 0 | camera device number }"
      "{ video | | video or image for detection }"
      "{ opencl | false | enable OpenCL }"
      "{ min_confidence | 0.5 | min confidence }";

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, params);

    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }

    String modelConfiguration = parser.get<string>("proto");
    String modelBinary = parser.get<string>("model");

    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]

    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt: " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models are available here:" << endl;
        cerr << "<OPENCV_SRC_DIR>/samples/dnn/face_detector" << endl;
        cerr << "or here:" << endl;
        cerr << "https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector" << endl;
        exit(-1);
    }

    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }

    VideoCapture cap;
    if (parser.get<String>("video").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if (!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if (!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }

    for (;;)
    {
        Mat frame;
        cap >> frame;  // get a new frame from camera/video or read image

        if (frame.empty())
        {
            waitKey();
            break;
        }

        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);

        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, inScaleFactor,
                                      Size(inWidth, inHeight), meanVal, false, false);  // Convert Mat to batch of images
        //! [Prepare blob]

        //! [Set input blob]
        net.setInput(inputBlob, "data");  // set the network input
        //! [Set input blob]

        //! [Make forward pass]
        Mat detection = net.forward("detection_out");  // compute output
        //! [Make forward pass]

        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;

        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());

        ostringstream ss;
        ss << "FPS: " << 1000 / time << " ; time: " << time << " ms";
        putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));

        float confidenceThreshold = parser.get<float>("min_confidence");
        for (int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);

            if (confidence > confidenceThreshold)
            {
                int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);

                Rect object(xLeftBottom, yLeftBottom,
                            xRightTop - xLeftBottom,
                            yRightTop - yLeftBottom);

                rectangle(frame, object, Scalar(0, 255, 0));

                ss.str("");
                ss << confidence;
                String conf(ss.str());
                String label = "Face: " + conf;
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
                                      Size(labelSize.width, labelSize.height + baseLine)),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(xLeftBottom, yLeftBottom),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
            }
        }

        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }

    return 0;
}  // main
@@ -1,55 +0,0 @@
import numpy as np
import argparse

try:
    import cv2 as cv
except ImportError:
    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')

from cv2 import dnn

inWidth = 300
inHeight = 300
confThreshold = 0.5

prototxt = 'face_detector/deploy.prototxt'
caffemodel = 'face_detector/res10_300x300_ssd_iter_140000.caffemodel'

if __name__ == '__main__':
    net = dnn.readNetFromCaffe(prototxt, caffemodel)
    cap = cv.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cols = frame.shape[1]
        rows = frame.shape[0]

        net.setInput(dnn.blobFromImage(frame, 1.0, (inWidth, inHeight), (104.0, 177.0, 123.0), False, False))
        detections = net.forward()

        perf_stats = net.getPerfProfile()

        print('Inference time, ms: %.2f' % (perf_stats[0] / cv.getTickFrequency() * 1000))

        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence > confThreshold:
                xLeftBottom = int(detections[0, 0, i, 3] * cols)
                yLeftBottom = int(detections[0, 0, i, 4] * rows)
                xRightTop = int(detections[0, 0, i, 5] * cols)
                yRightTop = int(detections[0, 0, i, 6] * rows)

                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                             (0, 255, 0))
                label = "face: %.4f" % confidence
                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)

                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
                             (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
                             (255, 255, 255), cv.FILLED)
                cv.putText(frame, label, (xLeftBottom, yLeftBottom),
                           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        cv.imshow("detections", frame)
        if cv.waitKey(1) != -1:
            break
@@ -1,187 +0,0 @@
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>

using namespace cv;
using namespace std;
using namespace cv::dnn;

const size_t inWidth = 300;
const size_t inHeight = 300;
const float inScaleFactor = 0.007843f;
const float meanVal = 127.5;
const char* classNames[] = {"background",
                            "aeroplane", "bicycle", "bird", "boat",
                            "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse",
                            "motorbike", "person", "pottedplant",
                            "sheep", "sofa", "train", "tvmonitor"};

const String keys
    = "{ help | false | print usage }"
      "{ proto | MobileNetSSD_deploy.prototxt | model configuration }"
      "{ model | MobileNetSSD_deploy.caffemodel | model weights }"
      "{ camera_device | 0 | camera device number }"
      "{ camera_width | 640 | camera device width }"
      "{ camera_height | 480 | camera device height }"
      "{ video | | video or image for detection }"
      "{ out | | path to output video file }"
      "{ min_confidence | 0.2 | min confidence }"
      "{ opencl | false | enable OpenCL }"
    ;

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);
    parser.about("This sample uses MobileNet Single-Shot Detector "
                 "(https://arxiv.org/abs/1704.04861) "
                 "to detect objects on camera/video/image.\n"
                 ".caffemodel model's file is available here: "
                 "https://github.com/chuanqi305/MobileNet-SSD\n"
                 "Default network is 300x300 and 20-classes VOC.\n");

    if (parser.get<bool>("help"))
    {
        parser.printMessage();
        return 0;
    }

    String modelConfiguration = parser.get<String>("proto");
    String modelBinary = parser.get<String>("model");
    CV_Assert(!modelConfiguration.empty() && !modelBinary.empty());

    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]

    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }

    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt: " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://github.com/chuanqi305/MobileNet-SSD" << endl;
        exit(-1);
    }

    VideoCapture cap;
    if (!parser.has("video"))
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if (!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }

        cap.set(CAP_PROP_FRAME_WIDTH, parser.get<int>("camera_width"));
        cap.set(CAP_PROP_FRAME_HEIGHT, parser.get<int>("camera_height"));
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if (!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }

    // Acquire input size
    Size inVideoSize((int) cap.get(CAP_PROP_FRAME_WIDTH),
                     (int) cap.get(CAP_PROP_FRAME_HEIGHT));

    double fps = cap.get(CAP_PROP_FPS);
    int fourcc = static_cast<int>(cap.get(CAP_PROP_FOURCC));
    VideoWriter outputVideo;
    outputVideo.open(parser.get<String>("out"),
                     (fourcc != 0 ? fourcc : VideoWriter::fourcc('M', 'J', 'P', 'G')),
                     (fps != 0 ? fps : 10.0), inVideoSize, true);

    for (;;)
    {
        Mat frame;
        cap >> frame;  // get a new frame from camera/video or read image

        if (frame.empty())
        {
            waitKey();
            break;
        }

        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);

        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, inScaleFactor,
                                      Size(inWidth, inHeight),
                                      Scalar(meanVal, meanVal, meanVal),
                                      false, false);  // Convert Mat to batch of images
        //! [Prepare blob]

        //! [Set input blob]
        net.setInput(inputBlob);  // set the network input
        //! [Set input blob]

        //! [Make forward pass]
        Mat detection = net.forward();  // compute output
        //! [Make forward pass]

        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;

        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());

        if (!outputVideo.isOpened())
        {
            putText(frame, format("FPS: %.2f ; time: %.2f ms", 1000.f / time, time),
                    Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
        }
        else
            cout << "Inference time, ms: " << time << endl;

        float confidenceThreshold = parser.get<float>("min_confidence");
        for (int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);

            if (confidence > confidenceThreshold)
            {
                size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));

                int left = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int top = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int right = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int bottom = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);

                rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
                String label = format("%s: %.2f", classNames[objectClass], confidence);
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                top = max(top, labelSize.height);
                rectangle(frame, Point(left, top - labelSize.height),
                          Point(left + labelSize.width, top + baseLine),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(left, top),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
            }
        }

        if (outputVideo.isOpened())
            outputVideo << frame;

        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }

    return 0;
}  // main
@@ -1,156 +0,0 @@
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>

using namespace cv;
using namespace std;
using namespace cv::dnn;

const char* classNames[] = {"background",
                            "aeroplane", "bicycle", "bird", "boat",
                            "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse",
                            "motorbike", "person", "pottedplant",
                            "sheep", "sofa", "train", "tvmonitor"};

const char* about = "This sample uses Single-Shot Detector "
                    "(https://arxiv.org/abs/1512.02325) "
                    "to detect objects on camera/video/image.\n"
                    ".caffemodel model's file is available here: "
                    "https://github.com/weiliu89/caffe/tree/ssd#models\n"
                    "Default network is 300x300 and 20-classes VOC.\n";

const char* params
    = "{ help | false | print usage }"
      "{ proto | | model configuration }"
      "{ model | | model weights }"
      "{ camera_device | 0 | camera device number }"
      "{ video | | video or image for detection }"
      "{ min_confidence | 0.5 | min confidence }";

int main(int argc, char** argv)
{
    cv::CommandLineParser parser(argc, argv, params);

    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }

    String modelConfiguration = parser.get<string>("proto");
    String modelBinary = parser.get<string>("model");

    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]

    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt: " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://github.com/weiliu89/caffe/tree/ssd#models" << endl;
        exit(-1);
    }

    VideoCapture cap;
    if (parser.get<String>("video").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if (!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if (!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }

    for (;;)
    {
        cv::Mat frame;
        cap >> frame;  // get a new frame from camera/video or read image

        if (frame.empty())
        {
            waitKey();
            break;
        }

        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);

        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, 1.0f, Size(300, 300), Scalar(104, 117, 123), false, false);  // Convert Mat to batch of images
        //! [Prepare blob]

        //! [Set input blob]
        net.setInput(inputBlob, "data");  // set the network input
        //! [Set input blob]

        //! [Make forward pass]
        Mat detection = net.forward("detection_out");  // compute output
        //! [Make forward pass]

        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;
        ostringstream ss;
        ss << "FPS: " << 1000 / time << " ; time: " << time << " ms";
        putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));

        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());

        float confidenceThreshold = parser.get<float>("min_confidence");
        for (int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);

            if (confidence > confidenceThreshold)
            {
                size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));

                int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);

                ss.str("");
                ss << confidence;
                String conf(ss.str());

                Rect object(xLeftBottom, yLeftBottom,
                            xRightTop - xLeftBottom,
                            yRightTop - yLeftBottom);

                rectangle(frame, object, Scalar(0, 255, 0));
                String label = String(classNames[objectClass]) + ": " + conf;
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
                                      Size(labelSize.width, labelSize.height + baseLine)),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(xLeftBottom, yLeftBottom),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
            }
        }

        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }

    return 0;
}  // main
@@ -1,185 +0,0 @@
// Brief Sample of using OpenCV dnn module in real time with device capture, video and image.
// VIDEO DEMO: https://www.youtube.com/watch?v=NHtRlndE2cg

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <fstream>
#include <iostream>

using namespace std;
using namespace cv;
using namespace cv::dnn;

static const char* about =
    "This sample uses You only look once (YOLO)-Detector (https://arxiv.org/abs/1612.08242) to detect objects on camera/video/image.\n"
    "Models can be downloaded here: https://pjreddie.com/darknet/yolo/\n"
    "Default network is 416x416.\n"
    "Class names can be downloaded here: https://github.com/pjreddie/darknet/tree/master/data\n";

static const char* params =
    "{ help | false | print usage }"
    "{ cfg | | model configuration }"
    "{ model | | model weights }"
    "{ camera_device | 0 | camera device number }"
    "{ source | | video or image for detection }"
    "{ out | | path to output video file }"
    "{ fps | 3 | frames per second }"
    "{ style | box | box or line style draw }"
    "{ min_confidence | 0.24 | min confidence }"
    "{ class_names | | File with class names, [PATH-TO-DARKNET]/data/coco.names }";

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, params);

    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }

    String modelConfiguration = parser.get<String>("cfg");
    String modelBinary = parser.get<String>("model");

    //! [Initialize network]
    dnn::Net net = readNetFromDarknet(modelConfiguration, modelBinary);
    //! [Initialize network]

    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "cfg-file: " << modelConfiguration << endl;
        cerr << "weights-file: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://pjreddie.com/darknet/yolo/" << endl;
        exit(-1);
    }

    VideoCapture cap;
    VideoWriter writer;
    int codec = VideoWriter::fourcc('M', 'J', 'P', 'G');
    double fps = parser.get<float>("fps");
    if (parser.get<String>("source").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if (!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("source"));
        if (!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("source") << endl;
            return -1;
        }
    }

    if (!parser.get<String>("out").empty())
    {
        writer.open(parser.get<String>("out"), codec, fps,
                    Size((int)cap.get(CAP_PROP_FRAME_WIDTH), (int)cap.get(CAP_PROP_FRAME_HEIGHT)), true);
    }

    vector<String> classNamesVec;
    ifstream classNamesFile(parser.get<String>("class_names").c_str());
    if (classNamesFile.is_open())
    {
        string className = "";
        while (std::getline(classNamesFile, className))
            classNamesVec.push_back(className);
    }

    String object_roi_style = parser.get<String>("style");

    for (;;)
    {
        Mat frame;
        cap >> frame;  // get a new frame from camera/video or read image

        if (frame.empty())
        {
            waitKey();
            break;
        }

        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);

        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);  // Convert Mat to batch of images
        //! [Prepare blob]

        //! [Set input blob]
        net.setInput(inputBlob, "data");  // set the network input
        //! [Set input blob]

        //! [Make forward pass]
        Mat detectionMat = net.forward("detection_out");  // compute output
        //! [Make forward pass]

        vector<double> layersTimings;
        double tick_freq = getTickFrequency();
        double time_ms = net.getPerfProfile(layersTimings) / tick_freq * 1000;
        putText(frame, format("FPS: %.2f ; time: %.2f ms", 1000.f / time_ms, time_ms),
                Point(20, 20), 0, 0.5, Scalar(0, 0, 255));

        float confidenceThreshold = parser.get<float>("min_confidence");
        for (int i = 0; i < detectionMat.rows; i++)
        {
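            // Each row of the Region-layer output is
            // [center_x, center_y, width, height, objectness, p(class_0), p(class_1), ...],
            // so per-class probabilities start at index 5.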
            const int probability_index = 5;
            const int probability_size = detectionMat.cols - probability_index;
            float *prob_array_ptr = &detectionMat.at<float>(i, probability_index);

            size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
            float confidence = detectionMat.at<float>(i, (int)objectClass + probability_index);

            if (confidence > confidenceThreshold)
            {
                float x_center = detectionMat.at<float>(i, 0) * frame.cols;
                float y_center = detectionMat.at<float>(i, 1) * frame.rows;
                float width = detectionMat.at<float>(i, 2) * frame.cols;
                float height = detectionMat.at<float>(i, 3) * frame.rows;
                Point p1(cvRound(x_center - width / 2), cvRound(y_center - height / 2));
                Point p2(cvRound(x_center + width / 2), cvRound(y_center + height / 2));
                Rect object(p1, p2);

                Scalar object_roi_color(0, 255, 0);

                if (object_roi_style == "box")
                {
                    rectangle(frame, object, object_roi_color);
                }
                else
                {
                    Point p_center(cvRound(x_center), cvRound(y_center));
                    line(frame, object.tl(), p_center, object_roi_color, 1);
                }

                String className = objectClass < classNamesVec.size() ? classNamesVec[objectClass] : cv::format("unknown(%d)", (int)objectClass);
                String label = format("%s: %.2f", className.c_str(), confidence);
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(p1, Size(labelSize.width, labelSize.height + baseLine)),
                          object_roi_color, FILLED);
                putText(frame, label, p1 + Point(0, labelSize.height),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
            }
        }
        if (writer.isOpened())
        {
            writer.write(frame);
        }

        imshow("YOLO: Detections", frame);
        if (waitKey(1) >= 0) break;
    }

    return 0;
}  // main