Open Source Computer Vision Library https://opencv.org/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

490 lines
14 KiB

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include <algorithm>
#include <iostream>
#include <utility>
#include <iterator>
#include <opencv2/imgproc.hpp>
namespace cv {
namespace dnn {
struct Model::Impl
{
//protected:
Net net;
Size size;
Scalar mean;
double scale = 1.0;
bool swapRB = false;
bool crop = false;
Mat blob;
std::vector<String> outNames;
public:
virtual ~Impl() {}
Impl() {}
Impl(const Impl&) = delete;
Impl(Impl&&) = delete;
virtual Net& getNetwork() const { return const_cast<Net&>(net); }
virtual void setPreferableBackend(Backend backendId) { net.setPreferableBackend(backendId); }
virtual void setPreferableTarget(Target targetId) { net.setPreferableTarget(targetId); }
/*virtual*/
void initNet(const Net& network)
{
net = network;
outNames = net.getUnconnectedOutLayersNames();
std::vector<MatShape> inLayerShapes;
std::vector<MatShape> outLayerShapes;
net.getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes);
if (!inLayerShapes.empty() && inLayerShapes[0].size() == 4)
size = Size(inLayerShapes[0][3], inLayerShapes[0][2]);
else
size = Size();
}
/*virtual*/
void setInputParams(double scale_, const Size& size_, const Scalar& mean_,
bool swapRB_, bool crop_)
{
size = size_;
mean = mean_;
scale = scale_;
crop = crop_;
swapRB = swapRB_;
}
/*virtual*/
void setInputSize(const Size& size_)
{
size = size_;
}
/*virtual*/
void setInputMean(const Scalar& mean_)
{
mean = mean_;
}
/*virtual*/
void setInputScale(double scale_)
{
scale = scale_;
}
/*virtual*/
void setInputCrop(bool crop_)
{
crop = crop_;
}
/*virtual*/
void setInputSwapRB(bool swapRB_)
{
swapRB = swapRB_;
}
/*virtual*/
void processFrame(InputArray frame, OutputArrayOfArrays outs)
{
if (size.empty())
CV_Error(Error::StsBadSize, "Input size not specified");
blob = blobFromImage(frame, scale, size, mean, swapRB, crop);
net.setInput(blob);
// Faster-RCNN or R-FCN
if (net.getLayer(0)->outputNameToIndex("im_info") != -1)
{
Mat imInfo(Matx31f(size.height, size.width, 1.6f));
net.setInput(imInfo, "im_info");
}
net.forward(outs, outNames);
}
};
Model::Model()
: impl(makePtr<Impl>())
{
// nothing
}
Model::Model(const String& model, const String& config)
: Model()
{
impl->initNet(readNet(model, config));
}
Model::Model(const Net& network)
: Model()
{
impl->initNet(network);
}
Net& Model::getNetwork_() const
{
CV_DbgAssert(impl);
return impl->getNetwork();
}
Model& Model::setPreferableBackend(Backend backendId)
{
CV_DbgAssert(impl);
impl->setPreferableBackend(backendId);
return *this;
}
Model& Model::setPreferableTarget(Target targetId)
{
CV_DbgAssert(impl);
impl->setPreferableTarget(targetId);
return *this;
}
Model& Model::setInputSize(const Size& size)
{
CV_DbgAssert(impl);
impl->setInputSize(size);
return *this;
}
Model& Model::setInputMean(const Scalar& mean)
{
CV_DbgAssert(impl);
impl->setInputMean(mean);
return *this;
}
Model& Model::setInputScale(double scale)
{
CV_DbgAssert(impl);
impl->setInputScale(scale);
return *this;
}
Model& Model::setInputCrop(bool crop)
{
CV_DbgAssert(impl);
impl->setInputCrop(crop);
return *this;
}
Model& Model::setInputSwapRB(bool swapRB)
{
CV_DbgAssert(impl);
impl->setInputSwapRB(swapRB);
return *this;
}
void Model::setInputParams(double scale, const Size& size, const Scalar& mean,
bool swapRB, bool crop)
{
CV_DbgAssert(impl);
impl->setInputParams(scale, size, mean, swapRB, crop);
}
void Model::predict(InputArray frame, OutputArrayOfArrays outs) const
{
CV_DbgAssert(impl);
impl->processFrame(frame, outs);
}
ClassificationModel::ClassificationModel(const String& model, const String& config)
: Model(model, config)
{
// nothing
}
ClassificationModel::ClassificationModel(const Net& network)
: Model(network)
{
// nothing
}
std::pair<int, float> ClassificationModel::classify(InputArray frame)
{
std::vector<Mat> outs;
impl->processFrame(frame, outs);
CV_Assert(outs.size() == 1);
double conf;
cv::Point maxLoc;
minMaxLoc(outs[0].reshape(1, 1), nullptr, &conf, nullptr, &maxLoc);
return {maxLoc.x, static_cast<float>(conf)};
}
void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
{
std::tie(classId, conf) = classify(frame);
}
KeypointsModel::KeypointsModel(const String& model, const String& config)
: Model(model, config) {};
KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};
std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
{
int frameHeight = frame.rows();
int frameWidth = frame.cols();
std::vector<Mat> outs;
impl->processFrame(frame, outs);
CV_Assert(outs.size() == 1);
Mat output = outs[0];
const int nPoints = output.size[1];
std::vector<Point2f> points;
// If output is a map, extract the keypoints
if (output.dims == 4)
{
int height = output.size[2];
int width = output.size[3];
// find the position of the keypoints (ignore the background)
for (int n=0; n < nPoints - 1; n++)
{
// Probability map of corresponding keypoint
Mat probMap(height, width, CV_32F, output.ptr(0, n));
Point2f p(-1, -1);
Point maxLoc;
double prob;
minMaxLoc(probMap, NULL, &prob, NULL, &maxLoc);
if (prob > thresh)
{
p = maxLoc;
p.x *= (float)frameWidth / width;
p.y *= (float)frameHeight / height;
}
points.push_back(p);
}
}
// Otherwise the output is a vector of keypoints and we can just return it
else
{
for (int n=0; n < nPoints; n++)
{
Point2f p;
p.x = *output.ptr<float>(0, n, 0);
p.y = *output.ptr<float>(0, n, 1);
points.push_back(p);
}
}
return points;
}
SegmentationModel::SegmentationModel(const String& model, const String& config)
: Model(model, config) {};
SegmentationModel::SegmentationModel(const Net& network) : Model(network) {};
void SegmentationModel::segment(InputArray frame, OutputArray mask)
{
std::vector<Mat> outs;
impl->processFrame(frame, outs);
CV_Assert(outs.size() == 1);
Mat score = outs[0];
const int chns = score.size[1];
const int rows = score.size[2];
const int cols = score.size[3];
mask.create(rows, cols, CV_8U);
Mat classIds = mask.getMat();
classIds.setTo(0);
Mat maxVal(rows, cols, CV_32F, score.data);
for (int ch = 1; ch < chns; ch++)
{
for (int row = 0; row < rows; row++)
{
const float *ptrScore = score.ptr<float>(0, ch, row);
uint8_t *ptrMaxCl = classIds.ptr<uint8_t>(row);
float *ptrMaxVal = maxVal.ptr<float>(row);
for (int col = 0; col < cols; col++)
{
if (ptrScore[col] > ptrMaxVal[col])
{
ptrMaxVal[col] = ptrScore[col];
ptrMaxCl[col] = ch;
}
}
}
}
}
void disableRegionNMS(Net& net)
{
for (String& name : net.getUnconnectedOutLayersNames())
{
int layerId = net.getLayerId(name);
Ptr<RegionLayer> layer = net.getLayer(layerId).dynamicCast<RegionLayer>();
if (!layer.empty())
{
layer->nmsThreshold = 0;
}
}
}
DetectionModel::DetectionModel(const String& model, const String& config)
: Model(model, config)
{
disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet()
}
DetectionModel::DetectionModel(const Net& network) : Model(network)
{
disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet()
}
void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
float confThreshold, float nmsThreshold)
{
std::vector<Mat> detections;
impl->processFrame(frame, detections);
boxes.clear();
confidences.clear();
classIds.clear();
int frameWidth = frame.cols();
int frameHeight = frame.rows();
if (getNetwork_().getLayer(0)->outputNameToIndex("im_info") != -1)
{
frameWidth = impl->size.width;
frameHeight = impl->size.height;
}
std::vector<String> layerNames = getNetwork_().getLayerNames();
int lastLayerId = getNetwork_().getLayerId(layerNames.back());
Ptr<Layer> lastLayer = getNetwork_().getLayer(lastLayerId);
if (lastLayer->type == "DetectionOutput")
{
// Network produces output blob with a shape 1x1xNx7 where N is a number of
// detections and an every detection is a vector of values
// [batchId, classId, confidence, left, top, right, bottom]
for (int i = 0; i < detections.size(); ++i)
{
float* data = (float*)detections[i].data;
for (int j = 0; j < detections[i].total(); j += 7)
{
float conf = data[j + 2];
if (conf < confThreshold)
continue;
int left = data[j + 3];
int top = data[j + 4];
int right = data[j + 5];
int bottom = data[j + 6];
int width = right - left + 1;
int height = bottom - top + 1;
if (width <= 2 || height <= 2)
{
left = data[j + 3] * frameWidth;
top = data[j + 4] * frameHeight;
right = data[j + 5] * frameWidth;
bottom = data[j + 6] * frameHeight;
width = right - left + 1;
height = bottom - top + 1;
}
left = std::max(0, std::min(left, frameWidth - 1));
top = std::max(0, std::min(top, frameHeight - 1));
width = std::max(1, std::min(width, frameWidth - left));
height = std::max(1, std::min(height, frameHeight - top));
boxes.emplace_back(left, top, width, height);
classIds.push_back(static_cast<int>(data[j + 1]));
confidences.push_back(conf);
}
}
}
else if (lastLayer->type == "Region")
{
std::vector<int> predClassIds;
std::vector<Rect> predBoxes;
std::vector<float> predConf;
for (int i = 0; i < detections.size(); ++i)
{
// Network produces output blob with a shape NxC where N is a number of
// detected objects and C is a number of classes + 4 where the first 4
// numbers are [center_x, center_y, width, height]
float* data = (float*)detections[i].data;
for (int j = 0; j < detections[i].rows; ++j, data += detections[i].cols)
{
Mat scores = detections[i].row(j).colRange(5, detections[i].cols);
Point classIdPoint;
double conf;
minMaxLoc(scores, nullptr, &conf, nullptr, &classIdPoint);
if (static_cast<float>(conf) < confThreshold)
continue;
int centerX = data[0] * frameWidth;
int centerY = data[1] * frameHeight;
int width = data[2] * frameWidth;
int height = data[3] * frameHeight;
int left = std::max(0, std::min(centerX - width / 2, frameWidth - 1));
int top = std::max(0, std::min(centerY - height / 2, frameHeight - 1));
width = std::max(1, std::min(width, frameWidth - left));
height = std::max(1, std::min(height, frameHeight - top));
predClassIds.push_back(classIdPoint.x);
predConf.push_back(static_cast<float>(conf));
predBoxes.emplace_back(left, top, width, height);
}
}
if (nmsThreshold)
{
std::map<int, std::vector<size_t> > class2indices;
for (size_t i = 0; i < predClassIds.size(); i++)
{
if (predConf[i] >= confThreshold)
{
class2indices[predClassIds[i]].push_back(i);
}
}
for (const auto& it : class2indices)
{
std::vector<Rect> localBoxes;
std::vector<float> localConfidences;
for (size_t idx : it.second)
{
localBoxes.push_back(predBoxes[idx]);
localConfidences.push_back(predConf[idx]);
}
std::vector<int> indices;
NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices);
classIds.resize(classIds.size() + indices.size(), it.first);
for (int idx : indices)
{
boxes.push_back(localBoxes[idx]);
confidences.push_back(localConfidences[idx]);
}
}
}
else
{
boxes = std::move(predBoxes);
classIds = std::move(predClassIds);
confidences = std::move(predConf);
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown output layer type: \"" + lastLayer->type + "\"");
}
}} // namespace