Merge pull request #25559 from gursimarsingh:improved_segmentation_sample

Improved segmentation sample #25559


This pull request replaces the Caffe models with ONNX models for the DNN segmentation sample in C++ and Python.
fcnresnet-50 and fcnresnet-101 have been replaced.
The u2netp (foreground-background) segmentation ONNX model (U2NET) has been added.

### Pull Request Readiness Checklist

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Gursimar Singh 10 months ago committed by GitHub
parent 17e6b3f931
commit 48c31bddc4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 25
  2. 149
  3. 62

@ -227,14 +227,13 @@ googlenet:
# Semantic segmentation models.
url: ""
sha1: "c449ea74dd7d83751d1357d6a8c323fcf4038962"
model: "fcn8s-heavy-pascal.caffemodel"
config: "fcn8s-heavy-pascal.prototxt"
mean: [0, 0, 0]
scale: 1.0
url: ""
sha1: "1bb0c7e0034038969aecc6251166f1612a139230"
model: "fcn-resnet50-12.onnx"
mean: [103.5, 116.2, 123.6]
scale: 0.019
width: 500
height: 500
rgb: false
@ -251,3 +250,15 @@ fcnresnet101:
height: 500
rgb: false
sample: "segmentation"
url: ""
sha1: "0a99236f0d5c1916a99a8c401b23e5ef32038606"
model: "u2netp.onnx"
mean: [123.6, 116.2, 103.5]
scale: 0.019
width: 320
height: 320
rgb: true
sample: "segmentation"

@ -1,5 +1,6 @@
#include <fstream>
#include <sstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
@ -7,50 +8,54 @@
#include "common.hpp"
std::string param_keys =
"{ help h | | Print help message. }"
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
"{ device | 0 | camera device number. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
"{ classes | | Optional path to a text file with names of classes. }"
"{ colors | | Optional path to a text file with colors for an every class. "
"An every color is represented with three values from 0 to 255 in BGR channels order. }";
std::string backend_keys = cv::format(
"{ backend | 0 | Choose one of computation backends: "
"%d: automatically (by default), "
"%d: Intel's Deep Learning Inference Engine (, "
"%d: OpenCV implementation, "
"%d: VKCOM, "
std::string target_keys = cv::format(
"{ target | 0 | Choose one of target computation devices: "
"%d: CPU target (by default), "
"%d: OpenCL, "
"%d: OpenCL fp16 (half-float precision), "
"%d: VPU, "
"%d: Vulkan, "
"%d: CUDA, "
"%d: CUDA fp16 (half-float preprocess) }", cv::dnn::DNN_TARGET_CPU, cv::dnn::DNN_TARGET_OPENCL, cv::dnn::DNN_TARGET_OPENCL_FP16, cv::dnn::DNN_TARGET_MYRIAD, cv::dnn::DNN_TARGET_VULKAN, cv::dnn::DNN_TARGET_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16);
std::string keys = param_keys + backend_keys + target_keys;
using namespace cv;
using namespace std;
using namespace dnn;
std::vector<std::string> classes;
std::vector<Vec3b> colors;
const string param_keys =
"{ help h | | Print help message. }"
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
"{ device | 0 | camera device number. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
"{ classes | | Optional path to a text file with names of classes. }"
"{ colors | | Optional path to a text file with colors for an every class. "
"Every color is represented with three values from 0 to 255 in BGR channels order. }";
const string backend_keys = format(
"{ backend | 0 | Choose one of computation backends: "
"%d: automatically (by default), "
"%d: Intel's Deep Learning Inference Engine (, "
"%d: OpenCV implementation, "
"%d: VKCOM, "
"%d: CUDA }",
const string target_keys = format(
"{ target | 0 | Choose one of target computation devices: "
"%d: CPU target (by default), "
"%d: OpenCL, "
"%d: OpenCL fp16 (half-float precision), "
"%d: VPU, "
"%d: Vulkan, "
"%d: CUDA, "
"%d: CUDA fp16 (half-float preprocess) }",
string keys = param_keys + backend_keys + target_keys;
vector<string> classes;
vector<Vec3b> colors;
void showLegend();
void colorizeSegmentation(const Mat &score, Mat &segm);
int main(int argc, char** argv)
int main(int argc, char **argv)
CommandLineParser parser(argc, argv, keys);
const std::string modelName = parser.get<String>("@alias");
const std::string zooFile = parser.get<String>("zoo");
const string modelName = parser.get<String>("@alias");
const string zooFile = parser.get<String>("zoo");
keys += genPreprocArguments(modelName, zooFile);
@ -68,36 +73,33 @@ int main(int argc, char** argv)
int inpWidth = parser.get<int>("width");
int inpHeight = parser.get<int>("height");
String model = findFile(parser.get<String>("model"));
String config = findFile(parser.get<String>("config"));
String framework = parser.get<String>("framework");
int backendId = parser.get<int>("backend");
int targetId = parser.get<int>("target");
// Open file with classes names.
if (parser.has("classes"))
std::string file = parser.get<String>("classes");
std::ifstream ifs(file.c_str());
string file = parser.get<String>("classes");
ifstream ifs(file.c_str());
if (!ifs.is_open())
CV_Error(Error::StsError, "File " + file + " not found");
std::string line;
while (std::getline(ifs, line))
string line;
while (getline(ifs, line))
// Open file with colors.
if (parser.has("colors"))
std::string file = parser.get<String>("colors");
std::ifstream ifs(file.c_str());
string file = parser.get<String>("colors");
ifstream ifs(file.c_str());
if (!ifs.is_open())
CV_Error(Error::StsError, "File " + file + " not found");
std::string line;
while (std::getline(ifs, line))
string line;
while (getline(ifs, line))
std::istringstream colorStr(line.c_str());
istringstream colorStr(line.c_str());
Vec3b color;
for (int i = 0; i < 3 && !colorStr.eof(); ++i)
@ -114,23 +116,21 @@ int main(int argc, char** argv)
//! [Read and initialize network]
Net net = readNet(model, config, framework);
Net net = readNetFromONNX(model);
//! [Read and initialize network]
// Create a window
static const std::string kWinName = "Deep learning semantic segmentation in OpenCV";
static const string kWinName = "Deep learning semantic segmentation in OpenCV";
namedWindow(kWinName, WINDOW_NORMAL);
//! [Open a video file or an image file or a camera stream]
VideoCapture cap;
if (parser.has("input"))<String>("input"));<String>("input")));
//! [Open a video file or an image file or a camera stream]
// Process frames.
Mat frame, blob;
while (waitKey(1) < 0)
@ -141,29 +141,45 @@ int main(int argc, char** argv)
imshow("Original Image", frame);
//! [Create a 4D blob from a frame]
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, false);
//! [Create a 4D blob from a frame]
//! [Set input blob]
//! [Set input blob]
//! [Make forward pass]
Mat score = net.forward();
//! [Make forward pass]
Mat segm;
colorizeSegmentation(score, segm);
resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);
addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);
if (modelName == "u2netp")
Mat mask, thresholded_mask, foreground_overlay, background_overlay, foreground_segmented;
mask = cv::Mat(score.size[2], score.size[3], CV_32F, score.ptr<float>(0, 0));
mask.convertTo(mask, CV_8U, 255);
threshold(mask, thresholded_mask, 0, 255, THRESH_BINARY + THRESH_OTSU);
resize(thresholded_mask, thresholded_mask, Size(frame.cols, frame.rows), 0, 0, INTER_AREA);
// Create overlays for foreground and background
foreground_overlay = Mat::zeros(frame.size(), frame.type());
background_overlay = Mat::zeros(frame.size(), frame.type());
// Set foreground (object) to red and background to blue
foreground_overlay.setTo(Scalar(0, 0, 255), thresholded_mask);
Mat inverted_mask;
bitwise_not(thresholded_mask, inverted_mask);
background_overlay.setTo(Scalar(255, 0, 0), inverted_mask);
// Blend the overlays with the original frame
addWeighted(frame, 1, foreground_overlay, 0.5, 0, foreground_segmented);
addWeighted(foreground_segmented, 1, background_overlay, 0.5, 0, frame);
Mat segm;
colorizeSegmentation(score, segm);
resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);
addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);
// Put efficiency information.
std::vector<double> layersTimes;
vector<double> layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
std::string label = format("Inference time: %.2f ms", t);
string label = format("Inference time: %.2f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
imshow(kWinName, frame);
@ -194,7 +210,8 @@ void colorizeSegmentation(const Mat &score, Mat &segm)
else if (chns != (int)colors.size())
CV_Error(Error::StsError, format("Number of output classes does not match "
"number of colors (%d != %zu)", chns, colors.size()));
"number of colors (%d != %zu)",
chns, colors.size()));
Mat maxCl = Mat::zeros(rows, cols, CV_8UC1);
@ -216,7 +233,6 @@ void colorizeSegmentation(const Mat &score, Mat &segm)
segm.create(rows, cols, CV_8UC3);
for (int row = 0; row < rows; row++)
@ -239,7 +255,8 @@ void showLegend()
if ((int)colors.size() != numClasses)
CV_Error(Error::StsError, format("Number of output classes does not match "
"number of labels (%zu != %zu)", colors.size(), classes.size()));
"number of labels (%zu != %zu)",
colors.size(), classes.size()));
legend.create(kBlockHeight * numClasses, 200, CV_8UC3);
for (int i = 0; i < numClasses; i++)

@ -14,9 +14,6 @@ parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
help='An optional path to file with preprocessing parameters.')
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'darknet', 'onnx'],
help='Optional name of an origin framework of the model. '
'Detect it automatically if it does not set.')
parser.add_argument('--colors', help='Optional path to a text file with colors for an every class. '
'An every color is represented with three values from 0 to 255 in BGR channels order.')
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
@ -44,7 +41,6 @@ parser = argparse.ArgumentParser(parents=[parser],
args = parser.parse_args()
args.model = findFile(args.model)
args.config = findFile(args.config)
args.classes = findFile(args.classes)
@ -79,7 +75,7 @@ def showLegend(classes):
classes = None
# Load a network
net = cv.dnn.readNet(args.model, args.config, args.framework)
net = cv.dnn.readNet(args.model)
@ -94,41 +90,53 @@ while cv.waitKey(1) < 0:
cv.imshow("Original Image", frame)
frameHeight = frame.shape[0]
frameWidth = frame.shape[1]
# Create a 4D blob from a frame.
inpWidth = args.width if args.width else frameWidth
inpHeight = args.height if args.height else frameHeight
blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)
# Run a model
score = net.forward()
numClasses = score.shape[1]
height = score.shape[2]
width = score.shape[3]
# Draw segmentation
if not colors:
# Generate colors
colors = [np.array([0, 0, 0], np.uint8)]
for i in range(1, numClasses):
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
classIds = np.argmax(score[0], axis=0)
segm = np.stack([colors[idx] for idx in classIds.flatten()])
segm = segm.reshape(height, width, 3)
segm = cv.resize(segm, (frameWidth, frameHeight), interpolation=cv.INTER_NEAREST)
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)
if args.alias == 'u2netp':
mask = score[0][0]
mask = mask.astype(np.uint8)
_, mask = cv.threshold(mask, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
mask = cv.resize(mask, (frame.shape[1], frame.shape[0]), interpolation=cv.INTER_AREA)
# Create overlays for foreground and background
foreground_overlay = np.zeros_like(frame, dtype=np.uint8)
background_overlay = np.zeros_like(frame, dtype=np.uint8)
# Set foreground (object) to red and background to blue
foreground_overlay[mask == 255] = [0, 0, 255] # Red foreground
background_overlay[mask == 0] = [255, 0, 0] # Blue background
# Blend the overlays with the original frame
foreground_segmented = cv.addWeighted(frame, 1, foreground_overlay, 0.5, 0)
frame = cv.addWeighted(foreground_segmented, 1, background_overlay, 0.5, 0)
numClasses = score.shape[1]
height = score.shape[2]
width = score.shape[3]
# Draw segmentation
if not colors:
# Generate colors
colors = [np.array([0, 0, 0], np.uint8)]
for i in range(1, numClasses):
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
classIds = np.argmax(score[0], axis=0)
segm = np.stack([colors[idx] for idx in classIds.flatten()])
segm = segm.reshape(height, width, 3)
segm = cv.resize(segm, (frameWidth, frameHeight), interpolation=cv.INTER_NEAREST)
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)
# Put efficiency information.
t, _ = net.getPerfProfile()
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
cv.imshow(winName, frame)
cv.imshow(winName, frame)