From 35eba9ca9096f9fd69a80acfeabd07b508608019 Mon Sep 17 00:00:00 2001
From: Gursimar Singh
Date: Tue, 6 Aug 2024 11:46:11 +0530
Subject: [PATCH] Merge pull request #25519 from
 gursimarsingh:improved_classification_sample

Improved classification sample #25519

#25006 #25314

This pull request replaces the Caffe models for classification with ONNX versions. It also adds ResNet to models.yml.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 .../dnn/dnn_googlenet/dnn_googlenet.markdown |  21 +-
 modules/imgproc/src/drawing_text.cpp         |   7 +-
 samples/dnn/classification.cpp               | 308 ++++++++++--------
 samples/dnn/classification.py                | 175 ++++++----
 samples/dnn/common.hpp                       |  91 +++++-
 samples/dnn/common.py                        |  83 ++++-
 samples/dnn/models.yml                       |  48 ++-
 7 files changed, 493 insertions(+), 240 deletions(-)

diff --git a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
index 4164d6165f..a89cd16944 100644
--- a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
+++ b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
@@ -1,4 +1,4 @@
-Load Caffe framework models {#tutorial_dnn_googlenet}
+Load ONNX framework models {#tutorial_dnn_googlenet}
 ===========================
 
 @tableofcontents
@@ -8,13 +8,13 @@ Load Caffe framework models {#tutorial_dnn_googlenet}
 |    |    |
 | -: | :- |
 | Original author | Vitaliy Lyudvichenko |
-| Compatibility | OpenCV >= 3.3 |
+| Compatibility | OpenCV >= 4.5.4 |
 
 Introduction
 ------------
 
 In this tutorial you will learn how to use opencv_dnn module for image classification by using
-GoogLeNet trained network from [Caffe model zoo](http://caffe.berkeleyvision.org/model_zoo.html).
+GoogLeNet trained network from [ONNX model zoo](https://github.com/onnx/models/).
 
 We will demonstrate results of this example on the following picture.
 ![Buran space shuttle](dnn/images/space_shuttle.jpg)
@@ -30,21 +30,18 @@ Explanation
 -----------
 
 -# Firstly, download GoogLeNet model files:
-   [bvlc_googlenet.prototxt ](https://github.com/opencv/opencv_extra/blob/5.x/testdata/dnn/bvlc_googlenet.prototxt) and
-   [bvlc_googlenet.caffemodel](http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel)
+   @code
+   python download_models.py googlenet
+   @endcode
 
    Also you need file with names of [ILSVRC2012](http://image-net.org/challenges/LSVRC/2012/browse-synsets) classes:
    [classification_classes_ILSVRC2012.txt](https://github.com/opencv/opencv/blob/5.x/samples/data/dnn/classification_classes_ILSVRC2012.txt).
 
    Put these files into working dir of this program example.
 
--# Read and initialize network using path to .prototxt and .caffemodel files
+-# Read and initialize network using path to .onnx file
    @snippet dnn/classification.cpp Read and initialize network
 
-   You can skip an argument `framework` if one of the files `model` or `config` has an
-   extension `.caffemodel` or `.prototxt`.
-   This way function cv::dnn::readNet can automatically detects a model's format.
-
 -# Read input image and convert to the blob, acceptable by GoogleNet
    @snippet dnn/classification.cpp Open a video file or an image file or a camera stream
 
@@ -53,7 +50,7 @@ Explanation
    @snippet dnn/classification.cpp Create a 4D blob from a frame
    We convert the image to a 4-dimensional blob (so-called batch) with `1x3x224x224` shape
    after applying necessary pre-processing like resizing and mean subtraction
-   `(-104, -117, -123)` for each blue, green and red channels correspondingly using cv::dnn::blobFromImage function.
+   for each of the blue, green and red channels, using the cv::dnn::blobFromImage function.
 
 -# Pass the blob to the network
    @snippet dnn/classification.cpp Set input blob
@@ -69,6 +66,6 @@ Explanation
 
 -# Run an example from command line
    @code
-   ./example_dnn_classification --model=bvlc_googlenet.caffemodel --config=bvlc_googlenet.prototxt --width=224 --height=224 --classes=classification_classes_ILSVRC2012.txt --input=space_shuttle.jpg --mean="104 117 123"
+   ./example_dnn_classification googlenet
   @endcode
   For our image we get prediction of class `space shuttle` with more than 99% sureness.
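The revised tutorial takes its preprocessing constants from models.yml instead of hard-coding them. As a stand-alone illustration of the blob step described above, here is a minimal Python sketch (the scale, mean and size values are placeholders; the authoritative ones live in the model's models.yml entry):

@code
import cv2 as cv

img = cv.imread(cv.samples.findFile("space_shuttle.jpg"))

# Placeholder preprocessing constants; the samples read the real ones
# (scale, mean, width, height, rgb) from the model's entry in models.yml.
blob = cv.dnn.blobFromImage(img, scalefactor=1.0, size=(224, 224),
                            mean=(104, 117, 123), swapRB=False, crop=False)
print(blob.shape)  # (1, 3, 224, 224): the 1x3x224x224 batch described above
@endcode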
diff --git a/modules/imgproc/src/drawing_text.cpp b/modules/imgproc/src/drawing_text.cpp
index 8202ed63f3..54f02aa468 100644
--- a/modules/imgproc/src/drawing_text.cpp
+++ b/modules/imgproc/src/drawing_text.cpp
@@ -1404,10 +1404,11 @@ Point FontRenderEngine::putText_(
     if (weight != 0)
     for(j = 0; j <= BUILTIN_FONTS_NUM; j++)
     {
-        int params[] = {STBTT_FOURCC('w', 'g', 'h', 't'), saved_weights[j]};
         font_t* ttface = (j < BUILTIN_FONTS_NUM ? builtin_ffaces[j] : fontface)->ttface;
-        if (ttface)
-            stbtt_SetInstance(ttface, params, 1, 0);
+        if (!ttface || stbtt_GetWeight(ttface) == saved_weights[j])
+            continue;
+        int params[] = {STBTT_FOURCC('w', 'g', 'h', 't'), saved_weights[j]};
+        stbtt_SetInstance(ttface, params, 1, 0);
     }
 
     return pen;
}"; -std::string backend_keys = cv::format( - "{ backend | 0 | Choose one of computation backends: " - "%d: automatically (by default), " - "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), " - "%d: OpenCV implementation, " - "%d: VKCOM, " - "%d: CUDA, " - "%d: WebNN }", cv::dnn::DNN_BACKEND_DEFAULT, cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_BACKEND_VKCOM, cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_BACKEND_WEBNN); -std::string target_keys = cv::format( - "{ target | 0 | Choose one of target computation devices: " - "%d: CPU target (by default), " - "%d: OpenCL, " - "%d: OpenCL fp16 (half-float precision), " - "%d: VPU, " - "%d: Vulkan, " - "%d: CUDA, " - "%d: CUDA fp16 (half-float preprocess) }", cv::dnn::DNN_TARGET_CPU, cv::dnn::DNN_TARGET_OPENCL, cv::dnn::DNN_TARGET_OPENCL_FP16, cv::dnn::DNN_TARGET_MYRIAD, cv::dnn::DNN_TARGET_VULKAN, cv::dnn::DNN_TARGET_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16); - -std::string keys = param_keys + backend_keys + target_keys; - using namespace cv; +using namespace std; using namespace dnn; -std::vector classes; +const string about = + "Use this script to run a classification model on a camera stream, video, image or image list (i.e. .xml or .yaml containing image lists)\n\n" + "Firstly, download required models using `download_models.py` (if not already done). Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.\n" + "To run:\n" + "\t ./example_dnn_classification model_name --input=path/to/your/input/image/or/video (don't give --input flag if want to use device camera)\n" + "Sample command:\n" + "\t ./example_dnn_classification resnet --input=$OPENCV_SAMPLES_DATA_PATH/baboon.jpg\n" + "\t ./example_dnn_classification squeezenet\n" + "Model path can also be specified using --model argument. " + "Use imagelist_creator to create the xml or yaml list\n"; + +const string param_keys = + "{ help h | | Print help message. }" + "{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }" + "{ zoo | ../dnn/models.yml | An optional path to file with preprocessing parameters }" + "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}" + "{ imglist | | Pass this flag if image list (i.e. 
+
+vector<string> classes;
+static bool readStringList( const string& filename, vector<string>& l )
+{
+    l.resize(0);
+    FileStorage fs(filename, FileStorage::READ);
+    if( !fs.isOpened() )
+        return false;
+    size_t dir_pos = filename.rfind('/');
+    if (dir_pos == string::npos)
+        dir_pos = filename.rfind('\\');
+    FileNode n = fs.getFirstTopLevelNode();
+    if( n.type() != FileNode::SEQ )
+        return false;
+    FileNodeIterator it = n.begin(), it_end = n.end();
+    for( ; it != it_end; ++it )
+    {
+        string fname = (string)*it;
+        if (dir_pos != string::npos)
+        {
+            string fpath = samples::findFile(filename.substr(0, dir_pos + 1) + fname, false);
+            if (fpath.empty())
+            {
+                fpath = samples::findFile(fname);
+            }
+            fname = fpath;
+        }
+        else
+        {
+            fname = samples::findFile(fname);
+        }
+        l.push_back(fname);
+    }
+    return true;
+}
 
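readStringList() consumes the FileStorage list format produced by the imagelist_creator sample: one top-level sequence of file names (the key name is irrelevant, since the reader takes getFirstTopLevelNode()). A minimal sketch of writing and re-reading such a list, with hypothetical paths:

@code
import cv2 as cv

# Write a minimal image list in FileStorage YAML form (paths are hypothetical).
with open("images.yaml", "w") as f:
    f.write("%YAML:1.0\n---\n")
    f.write("images:\n")
    for p in ("data/cat.jpg", "data/dog.jpg"):
        f.write('   - "%s"\n' % p)

# Read it back the same way readStringList() does.
fs = cv.FileStorage("images.yaml", cv.FILE_STORAGE_READ)
node = fs.getFirstTopLevelNode()
print([node.at(i).string() for i in range(int(node.size()))])
@endcode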
 int main(int argc, char** argv)
 {
     CommandLineParser parser(argc, argv, keys);
 
-    const std::string modelName = parser.get<String>("@alias");
-    const std::string zooFile = parser.get<String>("zoo");
+    if (!parser.has("@alias") || parser.has("help"))
+    {
+        cout << about << endl;
+        parser.printMessage();
+        return -1;
+    }
+    const string modelName = parser.get<String>("@alias");
+    const string zooFile = findFile(parser.get<String>("zoo"));
 
     keys += genPreprocArguments(modelName, zooFile);
-
     parser = CommandLineParser(argc, argv, keys);
-    parser.about("Use this script to run classification deep learning networks using OpenCV.");
+    parser.about(about);
 
     if (argc == 1 || parser.has("help"))
     {
         parser.printMessage();
         return 0;
     }
-
-    int rszWidth = parser.get<int>("initial_width");
-    int rszHeight = parser.get<int>("initial_height");
+    String sha1 = parser.get<String>("sha1");
     float scale = parser.get<float>("scale");
     Scalar mean = parser.get<Scalar>("mean");
     Scalar std = parser.get<Scalar>("std");
@@ -71,73 +118,94 @@ int main(int argc, char** argv)
     bool crop = parser.get<bool>("crop");
     int inpWidth = parser.get<int>("width");
     int inpHeight = parser.get<int>("height");
-    String model = findFile(parser.get<String>("model"));
-    String config = findFile(parser.get<String>("config"));
-    String framework = parser.get<String>("framework");
-    int backendId = parser.get<int>("backend");
-    int targetId = parser.get<int>("target");
-    bool needSoftmax = parser.get<bool>("needSoftmax");
-    std::cout<<"mean: "<<mean<<std::endl;
+    String model = findModel(parser.get<String>("model"), sha1);
+    String backend = parser.get<String>("backend");
+    String target = parser.get<String>("target");
+    bool isImgList = parser.has("imglist");
+
+    // Open file with labels.
+    string labels_filename = parser.get<String>("labels");
+    string file = findFile(labels_filename);
+    ifstream ifs(file.c_str());
+    if (!ifs.is_open()){
+        cout << "File " << file << " not found";
+        exit(1);
+    }
+    string line;
+    while (getline(ifs, line))
     {
-        std::string file = parser.get<String>("classes");
-        std::ifstream ifs(file.c_str());
-        if (!ifs.is_open())
-            CV_Error(Error::StsError, "File " + file + " not found");
-        std::string line;
-        while (std::getline(ifs, line))
-        {
-            classes.push_back(line);
-        }
+        classes.push_back(line);
     }
-
     if (!parser.check())
     {
         parser.printErrors();
         return 1;
     }
     CV_Assert(!model.empty());
-
     //! [Read and initialize network]
-    Net net = readNet(model, config, framework);
-    net.setPreferableBackend(backendId);
-    net.setPreferableTarget(targetId);
+    Net net = readNetFromONNX(model);
+    net.setPreferableBackend(getBackendID(backend));
+    net.setPreferableTarget(getTargetID(target));
     //! [Read and initialize network]
 
     // Create a window
     static const std::string kWinName = "Deep learning image classification in OpenCV";
     namedWindow(kWinName, WINDOW_NORMAL);
 
+    //Create FontFace for putText
+    FontFace sans("sans");
+
     //! [Open a video file or an image file or a camera stream]
     VideoCapture cap;
-    if (parser.has("input"))
-        cap.open(parser.get<String>("input"));
-    else
-        cap.open(0);
+    vector<string> imageList;
+    size_t currentImageIndex = 0;
+
+    if (parser.has("input")) {
+        string input = findFile(parser.get<String>("input"));
+
+        if (isImgList) {
+            bool check = readStringList(samples::findFile(input), imageList);
+            if (imageList.empty() || !check) {
+                cout << "Error: No images found or the provided file is not a valid .yaml or .xml file." << endl;
+                return -1;
+            }
+        } else {
+            // Input is not a directory, try to open as video or image
+            cap.open(input);
+            if (!cap.isOpened()) {
+                cout << "Failed to open the input." << endl;
+                return -1;
+            }
+        }
+    } else {
+        cap.open(0); // Open default camera
+    }
     //! [Open a video file or an image file or a camera stream]
 
-    // Process frames.
     Mat frame, blob;
-    while (waitKey(1) < 0)
+    for(;;)
     {
-        cap >> frame;
+        if (!imageList.empty()) {
+            // Handling directory of images
+            if (currentImageIndex >= imageList.size()) {
+                waitKey();
+                break; // Exit if all images are processed
+            }
+            frame = imread(imageList[currentImageIndex++]);
+            if (frame.empty()) {
+                cout << "Cannot open file" << endl;
+                continue;
+            }
+        } else {
+            cap >> frame;
+        }
         if (frame.empty())
         {
-            waitKey();
             break;
         }
-
-        if (rszWidth != 0 && rszHeight != 0)
-        {
-            resize(frame, frame, Size(rszWidth, rszHeight));
-        }
-
         //! [Create a 4D blob from a frame]
         blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);
 
-        // Check std values.
         if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
        {
@@ -145,69 +213,51 @@ int main(int argc, char** argv)
             divide(blob, std, blob);
         }
         //! [Create a 4D blob from a frame]
-
         //! [Set input blob]
         net.setInput(blob);
         //! [Set input blob]
-        //! [Make forward pass]
-        // double t_sum = 0.0;
-        // double t;
-        int classId;
-        double confidence;
-        cv::TickMeter timeRecorder;
+
+        TickMeter timeRecorder;
         timeRecorder.reset();
         Mat prob = net.forward();
         double t1;
+        //! [Make forward pass]
         timeRecorder.start();
         prob = net.forward();
         timeRecorder.stop();
-        t1 = timeRecorder.getTimeMilli();
+        //! [Make forward pass]
 
-        timeRecorder.reset();
-        for(int i = 0; i < 200; i++) {
-            //! [Make forward pass]
-            timeRecorder.start();
-            prob = net.forward();
-            timeRecorder.stop();
-
-            //! [Get a class with a highest score]
-            Point classIdPoint;
-            minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
-            classId = classIdPoint.x;
-            //! [Get a class with a highest score]
-
-            // Put efficiency information.
-            // std::vector<double> layersTimes;
-            // double freq = getTickFrequency() / 1000;
-            // t = net.getPerfProfile(layersTimes) / freq;
-            // t_sum += t;
-        }
-        if (needSoftmax == true)
-        {
-            float maxProb = 0.0;
-            float sum = 0.0;
-            Mat softmaxProb;
-
-            maxProb = *std::max_element(prob.begin<float>(), prob.end<float>());
-            cv::exp(prob-maxProb, softmaxProb);
-            sum = (float)cv::sum(softmaxProb)[0];
-            softmaxProb /= sum;
-            Point classIdPoint;
-            minMaxLoc(softmaxProb.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
-            classId = classIdPoint.x;
+        //! [Get a class with a highest score]
+        int N = (int)prob.total(), K = std::min(5, N);
+        std::vector<std::pair<float, int> > prob_vec;
+        for (int i = 0; i < N; i++) {
+            prob_vec.push_back(std::make_pair(-prob.at<float>(i), i));
         }
-        std::string label = format("Inference time of 1 round: %.2f ms", t1);
-        std::string label2 = format("Average time of 200 rounds: %.2f ms", timeRecorder.getTimeMilli()/200);
-        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
-        putText(frame, label2, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+        std::sort(prob_vec.begin(), prob_vec.end());
 
-        // Print predicted class.
-        label = format("%s: %.4f", (classes.empty() ? format("Class #%d", classId).c_str() :
-                                                      classes[classId].c_str()),
-                       confidence);
-        putText(frame, label, Point(0, 55), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+        //! [Get a class with a highest score]
+        t1 = timeRecorder.getTimeMilli();
+        timeRecorder.reset();
+        string label = format("Inference time: %.1f ms", t1);
+        Mat subframe = frame(Rect(0, 0, std::min(1000, frame.cols), std::min(300, frame.rows)));
+        subframe *= 0.3f;
+        putText(frame, label, Point(20, 50), Scalar(0, 255, 0), sans, 25, 800);
 
+        // Print predicted class.
+        for (int i = 0; i < K; i++) {
+            int classId = prob_vec[i].second;
+            float confidence = -prob_vec[i].first;
+            label = format("%d. %s: %.2f", i+1, (classes.empty() ? format("Class #%d", classId).c_str() :
+                                                                   classes[classId].c_str()), confidence);
+            putText(frame, label, Point(20, 110 + i*35), Scalar(0, 255, 0), sans, 25, 500);
+        }
         imshow(kWinName, frame);
+        int key = waitKey(isImgList ? 1000 : 100);
+        if (key == ' ')
+            key = waitKey();
+        if (key == 'q' || key == 27) // Check if 'q' or 'ESC' is pressed
+            return 0;
     }
+    waitKey();
     return 0;
 }
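The loop above selects the top-5 classes by sorting negated scores; the Python sample below does the same job with np.argpartition. For reference, a stand-alone sketch of that post-processing step, assuming `out` is the flattened score vector:

@code
import numpy as np

def top_k(out, k=5):
    # Partial selection of the k best scores, then a full sort of just those k;
    # equivalent to the C++ sample's sort over (-score, index) pairs.
    idx = np.argpartition(out, -k)[-k:]
    return idx[np.argsort(-out[idx])]

scores = np.array([0.05, 0.7, 0.1, 0.02, 0.9, 0.3])
print(top_k(scores, 3))  # [4 1 5]: class ids ordered by descending score
@endcode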
diff --git a/samples/dnn/classification.py b/samples/dnn/classification.py
index bbeacc5031..ea0b43fc75 100644
--- a/samples/dnn/classification.py
+++ b/samples/dnn/classification.py
@@ -1,49 +1,55 @@
+import os
+import glob
 import argparse
-
 import cv2 as cv
 import numpy as np
+import sys
 
 from common import *
 
+def help():
+    print(
+        '''
+        Firstly, download required models using `download_models.py` (if not already done). Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.
+
+        To run:
+        python classification.py model_name --input=path/to/your/input/image/or/video (omit the --input flag to use the device camera)
+
+        Sample command:
+        python classification.py googlenet --input=path/to/image
+        Model path can also be specified using --model argument
+        '''
+    )
 
 def get_args_parser(func_args):
-    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
-                cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
-    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
-               cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
+    backends = ("default", "openvino", "opencv", "vkcom", "cuda")
+    targets = ("cpu", "opencl", "opencl_fp16", "ncs2_vpu", "hddl_vpu", "vulkan", "cuda", "cuda_fp16")
 
     parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
                         help='An optional path to file with preprocessing parameters.')
     parser.add_argument('--input',
                         help='Path to input image or video file. Skip this argument to capture frames from a camera.')
-    parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'darknet'],
-                        help='Optional name of an origin framework of the model. '
-                             'Detect it automatically if it does not set.')
-    parser.add_argument('--std', nargs='*', type=float,
-                        help='Preprocess input image by dividing on a standard deviation.')
     parser.add_argument('--crop', type=bool, default=False,
-                        help='Preprocess input image by dividing on a standard deviation.')
-    parser.add_argument('--initial_width', type=int,
-                        help='Preprocess input image by initial resizing to a specific width.')
-    parser.add_argument('--initial_height', type=int,
-                        help='Preprocess input image by initial resizing to a specific height.')
-    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
-                        help="Choose one of computation backends: "
-                             "%d: automatically (by default), "
-                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                             "%d: OpenCV implementation, "
-                             "%d: VKCOM, "
-                             "%d: CUDA" % backends)
-    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
-                        help='Choose one of target computation devices: '
-                             '%d: CPU target (by default), '
-                             '%d: OpenCL, '
-                             '%d: OpenCL fp16 (half-float precision), '
-                             '%d: NCS2 VPU, '
-                             '%d: HDDL VPU, '
-                             '%d: Vulkan, '
-                             '%d: CUDA, '
-                             '%d: CUDA fp16 (half-float preprocess)'% targets)
+                        help='Center crop the image.')
+    parser.add_argument('--backend', default="default", type=str, choices=backends,
+                        help="Choose one of computation backends: "
+                             "default: automatically (by default), "
+                             "openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                             "opencv: OpenCV implementation, "
+                             "vkcom: VKCOM, "
+                             "cuda: CUDA, "
+                             "webnn: WebNN")
+    parser.add_argument('--target', default="cpu", type=str, choices=targets,
+                        help="Choose one of target computation devices: "
+                             "cpu: CPU target (by default), "
+                             "opencl: OpenCL, "
+                             "opencl_fp16: OpenCL fp16 (half-float precision), "
+                             "ncs2_vpu: NCS2 VPU, "
+                             "hddl_vpu: HDDL VPU, "
+                             "vulkan: Vulkan, "
+                             "cuda: CUDA, "
+                             "cuda_fp16: CUDA fp16 (half-float preprocess)")
+
+    args, _ = parser.parse_known_args()
     add_preproc_args(args.zoo, parser, 'classification')
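add_preproc_args() pulls the per-alias defaults (model, mean, scale, width, height, rgb, and the load_info sha1) from models.yml, the same file genPreprocArguments() reads on the C++ side. A simplified sketch of that lookup (key names follow the zoo file; sequence-valued entries such as mean are not handled here):

@code
import cv2 as cv

def zoo_defaults(zoo_path, alias):
    # Each alias is a top-level map in models.yml; scalar and string keys only.
    fs = cv.FileStorage(zoo_path, cv.FILE_STORAGE_READ)
    node = fs.getNode(alias)
    out = {}
    for key in ("model", "scale", "width", "height", "rgb", "sha1"):
        val = node.getNode("load_info").getNode(key) if key == "sha1" else node.getNode(key)
        if val.empty():
            out[key] = None
        elif val.isString():
            out[key] = val.string()
        else:
            out[key] = val.real()
    return out

# e.g. zoo_defaults("models.yml", "googlenet") -> {'model': ..., 'width': 224.0, ...}
@endcode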
@@ -52,41 +58,76 @@ def get_args_parser(func_args): formatter_class=argparse.ArgumentDefaultsHelpFormatter) return parser.parse_args(func_args) +def load_images(directory): + # List all common image file extensions, feel free to add more if needed + extensions = ['jpg', 'jpeg', 'png', 'bmp', 'tif', 'tiff'] + files = [] + for extension in extensions: + files.extend(glob.glob(os.path.join(directory, f'*.{extension}'))) + return files def main(func_args=None): args = get_args_parser(func_args) - args.model = findFile(args.model) - args.config = findFile(args.config) - args.classes = findFile(args.classes) + if args.alias is None or hasattr(args, 'help'): + help() + exit(1) + + args.model = findModel(args.model, args.sha1) + args.labels = findFile(args.labels) # Load names of classes - classes = None - if args.classes: - with open(args.classes, 'rt') as f: - classes = f.read().rstrip('\n').split('\n') + labels = None + if args.labels: + with open(args.labels, 'rt') as f: + labels = f.read().rstrip('\n').split('\n') # Load a network - net = cv.dnn.readNet(args.model, args.config, args.framework) - net.setPreferableBackend(args.backend) - net.setPreferableTarget(args.target) + + net = cv.dnn.readNet(args.model) + + net.setPreferableBackend(get_backend_id(args.backend)) + net.setPreferableTarget(get_target_id(args.target)) winName = 'Deep learning image classification in OpenCV' cv.namedWindow(winName, cv.WINDOW_NORMAL) - cap = cv.VideoCapture(args.input if args.input else 0) + isdir = False + + if args.input: + input_path = args.input + + if os.path.isdir(input_path): + isdir = True + image_files = load_images(input_path) + if not image_files: + print("No images found in the directory.") + exit(-1) + current_image_index = 0 + else: + input_path = findFile(input_path) + cap = cv.VideoCapture(input_path) + if not cap.isOpened(): + print("Failed to open the input video") + exit(-1) + else: + cap = cv.VideoCapture(0) + while cv.waitKey(1) < 0: - hasFrame, frame = cap.read() - if not hasFrame: - cv.waitKey() - break + if isdir: + if current_image_index >= len(image_files): + break + frame = cv.imread(image_files[current_image_index]) + current_image_index += 1 + else: + hasFrame, frame = cap.read() + if not hasFrame: + cv.waitKey() + break # Create a 4D blob from a frame. inpWidth = args.width if args.width else frame.shape[1] inpHeight = args.height if args.height else frame.shape[0] - if args.initial_width and args.initial_height: - frame = cv.resize(frame, (args.initial_width, args.initial_height)) - blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=args.crop) if args.std: blob[0] /= np.asarray(args.std, dtype=np.float32).reshape(3, 1, 1) @@ -95,22 +136,36 @@ def main(func_args=None): net.setInput(blob) out = net.forward() - # Get a class with a highest score. - out = out.flatten() - classId = np.argmax(out) - confidence = out[classId] + (h, w, _) = frame.shape + roi_rows = min(300, h) + roi_cols = min(1000, w) + frame[:roi_rows,:roi_cols,:] >>= 1 # Put efficiency information. t, _ = net.getPerfProfile() - label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) - cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0)) + label = 'Inference time: %.1f ms' % (t * 1000.0 / cv.getTickFrequency()) + cv.putText(frame, label, (15, 30), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) - # Print predicted class. 
-        label = '%s: %.4f' % (classes[classId] if classes else 'Class #%d' % classId, confidence)
-        cv.putText(frame, label, (0, 40), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
+        # Print predicted classes.
+        out = out.flatten()
+        K = 5
+        topKidx = np.argpartition(out, -K)[-K:]
+        for i in range(K):
+            classId = topKidx[i]
+            confidence = out[classId]
+            label = '%s: %.2f' % (labels[classId] if labels else 'Class #%d' % classId, confidence)
+            cv.putText(frame, label, (15, 90 + i*30), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
 
         cv.imshow(winName, frame)
+        key = cv.waitKey(1000 if isdir else 100)
+        if key >= 0:
+            key &= 255
+            if key == ord(' '):
+                key = cv.waitKey() & 255
+            if key == ord('q') or key == 27: # Check if 'q' or 'ESC' is pressed
+                sys.exit(0)
+    cv.waitKey()
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/samples/dnn/common.hpp b/samples/dnn/common.hpp
index 674d1052c5..43f7472e4f 100644
--- a/samples/dnn/common.hpp
+++ b/samples/dnn/common.hpp
@@ -1,5 +1,5 @@
 #include <opencv2/dnn.hpp>
-
+#include <opencv2/core/utils/filesystem.hpp>
 using namespace cv;
 
 std::string genArgument(const std::string& argName, const std::string& help,
@@ -10,6 +10,41 @@ std::string genPreprocArguments(const std::string& modelName, const std::string&
 std::string findFile(const std::string& filename);
+std::string findModel(const std::string& filename, const std::string& sha1);
+
+inline int getBackendID(const String& backend) {
+    std::map<std::string, int> backendIDs = {
+        {"default", cv::dnn::DNN_BACKEND_DEFAULT},
+        {"openvino", cv::dnn::DNN_BACKEND_INFERENCE_ENGINE},
+        {"opencv", cv::dnn::DNN_BACKEND_OPENCV},
+        {"vkcom", cv::dnn::DNN_BACKEND_VKCOM},
+        {"cuda", cv::dnn::DNN_BACKEND_CUDA},
+        {"webnn", cv::dnn::DNN_BACKEND_WEBNN}
+    };
+    if(backendIDs.find(backend) != backendIDs.end()){
+        return backendIDs[backend];
+    }else {
+        throw std::invalid_argument("Invalid backend name: " + backend);
+    }
+}
+
+inline int getTargetID(const String& target) {
+    std::map<std::string, int> targetIDs = {
+        {"cpu", cv::dnn::DNN_TARGET_CPU},
+        {"opencl", cv::dnn::DNN_TARGET_OPENCL},
+        {"opencl_fp16", cv::dnn::DNN_TARGET_OPENCL_FP16},
+        {"vpu", cv::dnn::DNN_TARGET_MYRIAD},
+        {"vulkan", cv::dnn::DNN_TARGET_VULKAN},
+        {"cuda", cv::dnn::DNN_TARGET_CUDA},
+        {"cuda_fp16", cv::dnn::DNN_TARGET_CUDA_FP16}
+    };
+    if(targetIDs.find(target) != targetIDs.end()){
+        return targetIDs[target];
+    }else {
+        throw std::invalid_argument("Invalid target name: " + target);
+    }
+}
+
 std::string genArgument(const std::string& argName, const std::string& help,
                         const std::string& modelName, const std::string& zooFile,
                         char key, std::string defaultVal)
@@ -23,6 +58,9 @@ std::string genArgument(const std::string& argName, const std::string& help,
     if (!node.empty())
     {
         FileNode value = node[argName];
+        if(argName == "sha1"){
+            value = node["load_info"][argName];
+        }
         if (!value.empty())
         {
             if (value.isReal())
@@ -53,14 +91,45 @@ std::string genArgument(const std::string& argName, const std::string& help,
     return "{ " + argName + " " + key + " | " + defaultVal + " | " + help + " }";
 }
 
+std::string findModel(const std::string& filename, const std::string& sha1)
+{
+    if (filename.empty() || utils::fs::exists(filename))
+        return filename;
+
+    if(!getenv("OPENCV_DOWNLOAD_CACHE_DIR")){
+        std::cout<< "[WARN] Please specify a path to model download directory in OPENCV_DOWNLOAD_CACHE_DIR environment variable"<