// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #include "precomp.hpp" #include <algorithm> #include <iostream> #include <utility> #include <iterator> #include <opencv2/imgproc.hpp> namespace cv { namespace dnn { struct Model::Impl { //protected: Net net; Size size; Scalar mean; double scale = 1.0; bool swapRB = false; bool crop = false; Mat blob; std::vector<String> outNames; public: virtual ~Impl() {} Impl() {} Impl(const Impl&) = delete; Impl(Impl&&) = delete; virtual Net& getNetwork() const { return const_cast<Net&>(net); } virtual void setPreferableBackend(Backend backendId) { net.setPreferableBackend(backendId); } virtual void setPreferableTarget(Target targetId) { net.setPreferableTarget(targetId); } /*virtual*/ void initNet(const Net& network) { net = network; outNames = net.getUnconnectedOutLayersNames(); std::vector<MatShape> inLayerShapes; std::vector<MatShape> outLayerShapes; net.getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes); if (!inLayerShapes.empty() && inLayerShapes[0].size() == 4) size = Size(inLayerShapes[0][3], inLayerShapes[0][2]); else size = Size(); } /*virtual*/ void setInputParams(double scale_, const Size& size_, const Scalar& mean_, bool swapRB_, bool crop_) { size = size_; mean = mean_; scale = scale_; crop = crop_; swapRB = swapRB_; } /*virtual*/ void setInputSize(const Size& size_) { size = size_; } /*virtual*/ void setInputMean(const Scalar& mean_) { mean = mean_; } /*virtual*/ void setInputScale(double scale_) { scale = scale_; } /*virtual*/ void setInputCrop(bool crop_) { crop = crop_; } /*virtual*/ void setInputSwapRB(bool swapRB_) { swapRB = swapRB_; } /*virtual*/ void processFrame(InputArray frame, OutputArrayOfArrays outs) { if (size.empty()) CV_Error(Error::StsBadSize, "Input size not specified"); blob = blobFromImage(frame, scale, size, mean, swapRB, crop); net.setInput(blob); // Faster-RCNN or R-FCN if (net.getLayer(0)->outputNameToIndex("im_info") != -1) { Mat imInfo(Matx31f(size.height, size.width, 1.6f)); net.setInput(imInfo, "im_info"); } net.forward(outs, outNames); } }; Model::Model() : impl(makePtr<Impl>()) { // nothing } Model::Model(const String& model, const String& config) : Model() { impl->initNet(readNet(model, config)); } Model::Model(const Net& network) : Model() { impl->initNet(network); } Net& Model::getNetwork_() const { CV_DbgAssert(impl); return impl->getNetwork(); } Model& Model::setPreferableBackend(Backend backendId) { CV_DbgAssert(impl); impl->setPreferableBackend(backendId); return *this; } Model& Model::setPreferableTarget(Target targetId) { CV_DbgAssert(impl); impl->setPreferableTarget(targetId); return *this; } Model& Model::setInputSize(const Size& size) { CV_DbgAssert(impl); impl->setInputSize(size); return *this; } Model& Model::setInputMean(const Scalar& mean) { CV_DbgAssert(impl); impl->setInputMean(mean); return *this; } Model& Model::setInputScale(double scale) { CV_DbgAssert(impl); impl->setInputScale(scale); return *this; } Model& Model::setInputCrop(bool crop) { CV_DbgAssert(impl); impl->setInputCrop(crop); return *this; } Model& Model::setInputSwapRB(bool swapRB) { CV_DbgAssert(impl); impl->setInputSwapRB(swapRB); return *this; } void Model::setInputParams(double scale, const Size& size, const Scalar& mean, bool swapRB, bool crop) { CV_DbgAssert(impl); impl->setInputParams(scale, size, mean, swapRB, crop); } void Model::predict(InputArray frame, OutputArrayOfArrays outs) const { CV_DbgAssert(impl); impl->processFrame(frame, outs); } ClassificationModel::ClassificationModel(const String& model, const String& config) : Model(model, config) { // nothing } ClassificationModel::ClassificationModel(const Net& network) : Model(network) { // nothing } std::pair<int, float> ClassificationModel::classify(InputArray frame) { std::vector<Mat> outs; impl->processFrame(frame, outs); CV_Assert(outs.size() == 1); double conf; cv::Point maxLoc; minMaxLoc(outs[0].reshape(1, 1), nullptr, &conf, nullptr, &maxLoc); return {maxLoc.x, static_cast<float>(conf)}; } void ClassificationModel::classify(InputArray frame, int& classId, float& conf) { std::tie(classId, conf) = classify(frame); } KeypointsModel::KeypointsModel(const String& model, const String& config) : Model(model, config) {}; KeypointsModel::KeypointsModel(const Net& network) : Model(network) {}; std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh) { int frameHeight = frame.rows(); int frameWidth = frame.cols(); std::vector<Mat> outs; impl->processFrame(frame, outs); CV_Assert(outs.size() == 1); Mat output = outs[0]; const int nPoints = output.size[1]; std::vector<Point2f> points; // If output is a map, extract the keypoints if (output.dims == 4) { int height = output.size[2]; int width = output.size[3]; // find the position of the keypoints (ignore the background) for (int n=0; n < nPoints - 1; n++) { // Probability map of corresponding keypoint Mat probMap(height, width, CV_32F, output.ptr(0, n)); Point2f p(-1, -1); Point maxLoc; double prob; minMaxLoc(probMap, NULL, &prob, NULL, &maxLoc); if (prob > thresh) { p = maxLoc; p.x *= (float)frameWidth / width; p.y *= (float)frameHeight / height; } points.push_back(p); } } // Otherwise the output is a vector of keypoints and we can just return it else { for (int n=0; n < nPoints; n++) { Point2f p; p.x = *output.ptr<float>(0, n, 0); p.y = *output.ptr<float>(0, n, 1); points.push_back(p); } } return points; } SegmentationModel::SegmentationModel(const String& model, const String& config) : Model(model, config) {}; SegmentationModel::SegmentationModel(const Net& network) : Model(network) {}; void SegmentationModel::segment(InputArray frame, OutputArray mask) { std::vector<Mat> outs; impl->processFrame(frame, outs); CV_Assert(outs.size() == 1); Mat score = outs[0]; const int chns = score.size[1]; const int rows = score.size[2]; const int cols = score.size[3]; mask.create(rows, cols, CV_8U); Mat classIds = mask.getMat(); classIds.setTo(0); Mat maxVal(rows, cols, CV_32F, score.data); for (int ch = 1; ch < chns; ch++) { for (int row = 0; row < rows; row++) { const float *ptrScore = score.ptr<float>(0, ch, row); uint8_t *ptrMaxCl = classIds.ptr<uint8_t>(row); float *ptrMaxVal = maxVal.ptr<float>(row); for (int col = 0; col < cols; col++) { if (ptrScore[col] > ptrMaxVal[col]) { ptrMaxVal[col] = ptrScore[col]; ptrMaxCl[col] = ch; } } } } } void disableRegionNMS(Net& net) { for (String& name : net.getUnconnectedOutLayersNames()) { int layerId = net.getLayerId(name); Ptr<RegionLayer> layer = net.getLayer(layerId).dynamicCast<RegionLayer>(); if (!layer.empty()) { layer->nmsThreshold = 0; } } } DetectionModel::DetectionModel(const String& model, const String& config) : Model(model, config) { disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet() } DetectionModel::DetectionModel(const Net& network) : Model(network) { disableRegionNMS(getNetwork_()); // FIXIT Move to DetectionModel::Impl::initNet() } void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds, CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes, float confThreshold, float nmsThreshold) { std::vector<Mat> detections; impl->processFrame(frame, detections); boxes.clear(); confidences.clear(); classIds.clear(); int frameWidth = frame.cols(); int frameHeight = frame.rows(); if (getNetwork_().getLayer(0)->outputNameToIndex("im_info") != -1) { frameWidth = impl->size.width; frameHeight = impl->size.height; } std::vector<String> layerNames = getNetwork_().getLayerNames(); int lastLayerId = getNetwork_().getLayerId(layerNames.back()); Ptr<Layer> lastLayer = getNetwork_().getLayer(lastLayerId); if (lastLayer->type == "DetectionOutput") { // Network produces output blob with a shape 1x1xNx7 where N is a number of // detections and an every detection is a vector of values // [batchId, classId, confidence, left, top, right, bottom] for (int i = 0; i < detections.size(); ++i) { float* data = (float*)detections[i].data; for (int j = 0; j < detections[i].total(); j += 7) { float conf = data[j + 2]; if (conf < confThreshold) continue; int left = data[j + 3]; int top = data[j + 4]; int right = data[j + 5]; int bottom = data[j + 6]; int width = right - left + 1; int height = bottom - top + 1; if (width <= 2 || height <= 2) { left = data[j + 3] * frameWidth; top = data[j + 4] * frameHeight; right = data[j + 5] * frameWidth; bottom = data[j + 6] * frameHeight; width = right - left + 1; height = bottom - top + 1; } left = std::max(0, std::min(left, frameWidth - 1)); top = std::max(0, std::min(top, frameHeight - 1)); width = std::max(1, std::min(width, frameWidth - left)); height = std::max(1, std::min(height, frameHeight - top)); boxes.emplace_back(left, top, width, height); classIds.push_back(static_cast<int>(data[j + 1])); confidences.push_back(conf); } } } else if (lastLayer->type == "Region") { std::vector<int> predClassIds; std::vector<Rect> predBoxes; std::vector<float> predConf; for (int i = 0; i < detections.size(); ++i) { // Network produces output blob with a shape NxC where N is a number of // detected objects and C is a number of classes + 4 where the first 4 // numbers are [center_x, center_y, width, height] float* data = (float*)detections[i].data; for (int j = 0; j < detections[i].rows; ++j, data += detections[i].cols) { Mat scores = detections[i].row(j).colRange(5, detections[i].cols); Point classIdPoint; double conf; minMaxLoc(scores, nullptr, &conf, nullptr, &classIdPoint); if (static_cast<float>(conf) < confThreshold) continue; int centerX = data[0] * frameWidth; int centerY = data[1] * frameHeight; int width = data[2] * frameWidth; int height = data[3] * frameHeight; int left = std::max(0, std::min(centerX - width / 2, frameWidth - 1)); int top = std::max(0, std::min(centerY - height / 2, frameHeight - 1)); width = std::max(1, std::min(width, frameWidth - left)); height = std::max(1, std::min(height, frameHeight - top)); predClassIds.push_back(classIdPoint.x); predConf.push_back(static_cast<float>(conf)); predBoxes.emplace_back(left, top, width, height); } } if (nmsThreshold) { std::map<int, std::vector<size_t> > class2indices; for (size_t i = 0; i < predClassIds.size(); i++) { if (predConf[i] >= confThreshold) { class2indices[predClassIds[i]].push_back(i); } } for (const auto& it : class2indices) { std::vector<Rect> localBoxes; std::vector<float> localConfidences; for (size_t idx : it.second) { localBoxes.push_back(predBoxes[idx]); localConfidences.push_back(predConf[idx]); } std::vector<int> indices; NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices); classIds.resize(classIds.size() + indices.size(), it.first); for (int idx : indices) { boxes.push_back(localBoxes[idx]); confidences.push_back(localConfidences[idx]); } } } else { boxes = std::move(predBoxes); classIds = std::move(predClassIds); confidences = std::move(predConf); } } else CV_Error(Error::StsNotImplemented, "Unknown output layer type: \"" + lastLayer->type + "\""); } }} // namespace