From a371bdac9d9523cb1a6f7ecb1e4d0f18f9cdcd67 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov <anatoliy.talamanov@intel.com>
Date: Tue, 13 Jun 2023 16:06:19 +0100
Subject: [PATCH] Merge pull request #23766 from
 TolyaTalamanov:at/segmentation-demo-desync

G-API: Refine Semantic Segmentation Demo #23766

### Overview
* Added support for running the demo with a camera id (e.g. `--input=0`); see the usage sketch below
* Added support for segmentation models with 3D output (e.g. `deeplabv3`)
* Added support for `desync` execution
* Added support for higher camera resolutions
* Changed the color map to Pascal VOC (https://cloud.githubusercontent.com/assets/4503207/17803328/1006ca80-65f6-11e6-9ff6-36b7ef5b9ac6.png)
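
For reference, a minimal Python sketch of the extended `make_capture_src` overload this patch exposes (the property values are illustrative, and the surrounding pipeline code is assumed):

```python
import cv2 as cv

# make_capture_src() now accepts an optional {property_id: value} dict that is
# applied to the underlying cv::VideoCapture before the first frame is read.
source = cv.gapi.wip.make_capture_src(
    0,  # camera id; the string overload for file paths takes the same dict
    {
        cv.CAP_PROP_FRAME_WIDTH:  1280,
        cv.CAP_PROP_FRAME_HEIGHT: 720,
        cv.CAP_PROP_BUFFERSIZE:   1,
    }
)
# The source then feeds a streaming pipeline as usual, e.g.:
#   pipeline.setSource(cv.gin(source))
```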

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 .../include/opencv2/gapi/streaming/cap.hpp    |  31 ++-
 modules/gapi/misc/python/pyopencv_gapi.hpp    |   1 +
 .../gapi/samples/semantic_segmentation.cpp    | 202 +++++++++++++-----
 modules/python/src2/cv2_convert.hpp           |   5 +
 .../predefined_types.py                       |  10 +-
 5 files changed, 187 insertions(+), 62 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/streaming/cap.hpp b/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
index adf1133c3f..9c2185c1ab 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
@@ -22,6 +22,7 @@
  * because of this file.
  */
 #include <chrono>
+#include <map>
 
 #include <opencv2/videoio.hpp>
 #include <opencv2/gapi/garg.hpp>
@@ -47,8 +48,16 @@ namespace wip {
 class GCaptureSource: public IStreamSource
 {
 public:
-    explicit GCaptureSource(int id) : cap(id) { prep(); }
-    explicit GCaptureSource(const std::string &path) : cap(path) { prep(); }
+    explicit GCaptureSource(int id, const std::map<int, double> &properties = {})
+        : cap(id) { prep(properties); }
+
+    explicit GCaptureSource(const std::string &path,
+                            const std::map<int, double> &properties = {})
+        : cap(path) { prep(properties); }
+
+    void set(int propid, double value) {
+        cap.set(propid, value);
+    }
 
     // TODO: Add more constructor overloads to make it
     // fully compatible with VideoCapture's interface.
@@ -59,8 +68,12 @@ protected:
     bool first_pulled = false;
     int64_t counter = 0;
 
-    void prep()
+    void prep(const std::map<int, double> &properties)
     {
+        for (const auto &it : properties) {
+            cap.set(it.first, it.second);
+        }
+
         // Prepare first frame to report its meta to engine
         // when needed
         GAPI_Assert(first.empty());
@@ -114,15 +127,19 @@ protected:
 };
 
 // NB: Overload for using from python
-GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_capture_src(const std::string& path)
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline make_capture_src(const std::string& path,
+                        const std::map<int, double>& properties = {})
 {
-    return make_src<GCaptureSource>(path);
+    return make_src<GCaptureSource>(path, properties);
 }
 
 // NB: Overload for using from python
-GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_capture_src(const int id)
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline make_capture_src(const int id,
+                        const std::map<int, double>& properties = {})
 {
-    return make_src<GCaptureSource>(id);
+    return make_src<GCaptureSource>(id, properties);
 }
 
 } // namespace wip
diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp
index 736d8cc86e..70698ffd48 100644
--- a/modules/gapi/misc/python/pyopencv_gapi.hpp
+++ b/modules/gapi/misc/python/pyopencv_gapi.hpp
@@ -28,6 +28,7 @@ using map_string_and_string         = std::map<std::string, std::string>;
 using map_string_and_string         = std::map<std::string, std::string>;
 using map_string_and_vector_size_t  = std::map<std::string, std::vector<size_t>>;
 using map_string_and_vector_float   = std::map<std::string, std::vector<float>>;
+using map_int_and_double            = std::map<int, double>;
 
 // NB: Python wrapper generate T_U for T<U>
 // This behavior is only observed for inputs
diff --git a/modules/gapi/samples/semantic_segmentation.cpp b/modules/gapi/samples/semantic_segmentation.cpp
index fd3ec27750..db358816d1 100644
--- a/modules/gapi/samples/semantic_segmentation.cpp
+++ b/modules/gapi/samples/semantic_segmentation.cpp
@@ -5,34 +5,41 @@
 #include <opencv2/gapi/operators.hpp>
 #include <opencv2/highgui.hpp>
 
+#include <opencv2/gapi/streaming/desync.hpp>
+#include <opencv2/gapi/streaming/format.hpp>
+
+#include <iomanip>
+#include <sstream>
+
 const std::string keys =
     "{ h help |                                     | Print this help message }"
+    "{ desync | false                               | Desynchronize inference }"
     "{ input  |                                     | Path to the input video file }"
     "{ output |                                     | Path to the output video file }"
     "{ ssm    | semantic-segmentation-adas-0001.xml | Path to OpenVINO IE semantic segmentation model (.xml) }";
 
-// 20 colors for 20 classes of semantic-segmentation-adas-0001
+// Pascal VOC color map: background + 20 classes
-const std::vector<cv::Vec3b> colors = {
-    { 128, 64,  128 },
-    { 232, 35,  244 },
-    { 70,  70,  70 },
-    { 156, 102, 102 },
-    { 153, 153, 190 },
-    { 153, 153, 153 },
-    { 30,  170, 250 },
-    { 0,   220, 220 },
-    { 35,  142, 107 },
-    { 152, 251, 152 },
-    { 180, 130, 70 },
-    { 60,  20,  220 },
-    { 0,   0,   255 },
-    { 142, 0,   0 },
-    { 70,  0,   0 },
-    { 100, 60,  0 },
-    { 90,  0,   0 },
-    { 230, 0,   0 },
-    { 32,  11,  119 },
-    { 0,   74,  111 },
+static std::vector<cv::Vec3b> colors = {
+    { 0, 0, 0 },
+    { 0, 0, 128 },
+    { 0, 128, 0 },
+    { 0, 128, 128 },
+    { 128, 0, 0 },
+    { 128, 0, 128 },
+    { 128, 128, 0 },
+    { 128, 128, 128 },
+    { 0, 0, 64 },
+    { 0, 0, 192 },
+    { 0, 128, 64 },
+    { 0, 128, 192 },
+    { 128, 0, 64 },
+    { 128, 0, 192 },
+    { 128, 128, 64 },
+    { 128, 128, 192 },
+    { 0, 64, 0 },
+    { 0, 64, 128 },
+    { 0, 192, 0 },
+    { 0, 192, 128 },
+    { 128, 64, 0 }
 };
 
 namespace {
@@ -43,12 +50,23 @@ std::string get_weights_path(const std::string &model_path) {
 
     auto ext = model_path.substr(sz - EXT_LEN);
     std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c){
-            return static_cast<unsigned char>(std::tolower(c));
-        });
+        return static_cast<unsigned char>(std::tolower(c));
+    });
     CV_Assert(ext == ".xml");
     return model_path.substr(0u, sz - EXT_LEN) + ".bin";
 }
 
+bool isNumber(const std::string &str) {
+    return !str.empty() && std::all_of(str.begin(), str.end(),
+            [](unsigned char ch) { return std::isdigit(ch); });
+}
+
+std::string toStr(double value) {
+    std::stringstream ss;
+    ss << std::fixed << std::setprecision(1) << value;
+    return ss.str();
+}
+
 void classesToColors(const cv::Mat &out_blob,
                            cv::Mat &mask_img) {
     const int H = out_blob.size[0];
@@ -97,6 +115,25 @@ void probsToClasses(const cv::Mat& probs, cv::Mat& classes) {
 
 } // anonymous namespace
 
+namespace vis {
+
+static void putText(cv::Mat& mat, const cv::Point &position, const std::string &message) {
+    auto fontFace = cv::FONT_HERSHEY_COMPLEX;
+    int thickness = 2;
+    cv::Scalar color = {200, 10, 10};
+    double fontScale = 0.65;
+
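+    // Draw a white underlay (thickness + 1) first so the text stays readable on any background.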
+    cv::putText(mat, message, position, fontFace,
+                fontScale, cv::Scalar(255, 255, 255), thickness + 1);
+    cv::putText(mat, message, position, fontFace, fontScale, color, thickness);
+}
+
+static void drawResults(cv::Mat &img, const cv::Mat &color_mask) {
+    img = img / 2 + color_mask / 2;
+}
+
+} // namespace vis
+
 namespace custom {
 G_API_OP(PostProcessing, <cv::GMat(cv::GMat, cv::GMat)>, "sample.custom.post_processing") {
     static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GMatDesc &) {
@@ -106,19 +143,34 @@ G_API_OP(PostProcessing, <cv::GMat(cv::GMat, cv::GMat)>, "sample.custom.post_pro
 
 GAPI_OCV_KERNEL(OCVPostProcessing, PostProcessing) {
     static void run(const cv::Mat &in, const cv::Mat &out_blob, cv::Mat &out) {
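+        // Deduce the output layout: 4D blobs are NCHW, 3D blobs are CHW.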
+        int C = -1, H = -1, W = -1;
+        if (out_blob.size.dims() == 4u) {
+            C = 1; H = 2; W = 3;
+        } else if (out_blob.size.dims() == 3u) {
+            C = 0; H = 1; W = 2;
+        } else {
+            throw std::logic_error(
+                    "Number of dimensions for model output must be 3 or 4!");
+        }
         cv::Mat classes;
         // NB: If output has more than single plane, it contains probabilities
         // otherwise class id.
-        if (out_blob.size[1] > 1) {
+        if (out_blob.size[C] > 1) {
             probsToClasses(out_blob, classes);
         } else {
-            out_blob.convertTo(classes, CV_8UC1);
-            classes = classes.reshape(1, out_blob.size[2]);
+            if (out_blob.depth() != CV_32S) {
+                throw std::logic_error(
+                        "Single channel output must have integer precision!");
+            }
+            cv::Mat view(out_blob.size[H], // rows
+                         out_blob.size[W], // cols
+                         CV_32SC1,
+                         out_blob.data);
+            view.convertTo(classes, CV_8UC1);
         }
-
         cv::Mat mask_img;
         classesToColors(classes, mask_img);
-        cv::resize(mask_img, out, in.size());
+        cv::resize(mask_img, out, in.size(), 0, 0, cv::INTER_NEAREST);
     }
 };
 } // namespace custom
@@ -134,6 +186,7 @@ int main(int argc, char *argv[]) {
     const std::string input  = cmd.get<std::string>("input");
     const std::string output = cmd.get<std::string>("output");
     const auto model_path    = cmd.get<std::string>("ssm");
+    const bool desync        = cmd.get<bool>("desync");
     const auto weights_path  = get_weights_path(model_path);
     const auto device        = "CPU";
     G_API_NET(SemSegmNet, <cv::GMat(cv::GMat)>, "semantic-segmentation");
@@ -145,40 +198,87 @@ int main(int argc, char *argv[]) {
 
     // Now build the graph
     cv::GMat in;
-    cv::GMat out_blob = cv::gapi::infer<SemSegmNet>(in);
-    cv::GMat post_proc_out = custom::PostProcessing::on(in, out_blob);
-    cv::GMat blending_in = in * 0.3f;
-    cv::GMat blending_out = post_proc_out * 0.7f;
-    cv::GMat out = blending_in + blending_out;
+    cv::GMat bgr = cv::gapi::copy(in);
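+    // NB: With --desync, inference runs on a desynchronized branch of the graph,
+    // so the main stream is not blocked waiting for the model.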
+    cv::GMat frame = desync ? cv::gapi::streaming::desync(bgr) : bgr;
+    cv::GMat out_blob = cv::gapi::infer<SemSegmNet>(frame);
+    cv::GMat out = custom::PostProcessing::on(frame, out_blob);
 
-    cv::GStreamingCompiled pipeline = cv::GComputation(cv::GIn(in), cv::GOut(out))
-        .compileStreaming(cv::compile_args(kernels, networks));
-    auto inputs = cv::gin(cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input));
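+    // NB: queue_capacity{1} keeps the internal queues short to minimize latency.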
+    cv::GStreamingCompiled pipeline = cv::GComputation(cv::GIn(in), cv::GOut(bgr, out))
+        .compileStreaming(cv::compile_args(kernels, networks,
+                          cv::gapi::streaming::queue_capacity{1}));
+
+    std::shared_ptr<cv::gapi::wip::GCaptureSource> source;
+    if (isNumber(input)) {
+        source = std::make_shared<cv::gapi::wip::GCaptureSource>(
+            std::stoi(input),
+            std::map<int, double> {
+              {cv::CAP_PROP_FRAME_WIDTH, 1280},
+              {cv::CAP_PROP_FRAME_HEIGHT, 720},
+              {cv::CAP_PROP_BUFFERSIZE, 1},
+              {cv::CAP_PROP_AUTOFOCUS, true}
+            }
+        );
+    } else {
+        source = std::make_shared<cv::gapi::wip::GCaptureSource>(input);
+    }
+    auto inputs = cv::gin(
+            static_cast<cv::gapi::wip::IStreamSource::Ptr>(source));
 
     // The execution part
     pipeline.setSource(std::move(inputs));
 
-    cv::VideoWriter writer;
     cv::TickMeter tm;
-    cv::Mat outMat;
+    cv::VideoWriter writer;
+
+    cv::util::optional<cv::Mat> color_mask;
+    cv::util::optional<cv::Mat> image;
+    cv::Mat last_image;
+    cv::Mat last_color_mask;
 
-    std::size_t frames = 0u;
-    tm.start();
     pipeline.start();
-    while (pipeline.pull(cv::gout(outMat))) {
-        ++frames;
-        cv::imshow("Out", outMat);
-        cv::waitKey(1);
-        if (!output.empty()) {
-            if (!writer.isOpened()) {
-                const auto sz = cv::Size{outMat.cols, outMat.rows};
-                writer.open(output, cv::VideoWriter::fourcc('M','J','P','G'), 25.0, sz);
-                CV_Assert(writer.isOpened());
+    tm.start();
+
+    std::size_t frames = 0u;
+    std::size_t masks  = 0u;
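+    // NB: In desync mode the two outputs arrive independently, so each pull()
+    // may deliver only some of them (hence the cv::util::optional outputs).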
+    while (pipeline.pull(cv::gout(image, color_mask))) {
+        if (image.has_value()) {
+            ++frames;
+            last_image = std::move(*image);
+        }
+
+        if (color_mask.has_value()) {
+            ++masks;
+            last_color_mask = std::move(*color_mask);
+        }
+
+        if (!last_image.empty() && !last_color_mask.empty()) {
+            tm.stop();
+
+            std::string stream_fps = "Stream FPS: " + toStr(frames / tm.getTimeSec());
+            std::string inference_fps = "Inference FPS: " + toStr(masks  / tm.getTimeSec());
+
+            cv::Mat tmp = last_image.clone();
+
+            vis::drawResults(tmp, last_color_mask);
+            vis::putText(tmp, {10, 22}, stream_fps);
+            vis::putText(tmp, {10, 22 + 30}, inference_fps);
+
+            cv::imshow("Out", tmp);
+            cv::waitKey(1);
+            if (!output.empty()) {
+                if (!writer.isOpened()) {
+                    const auto sz = cv::Size{tmp.cols, tmp.rows};
+                    writer.open(output, cv::VideoWriter::fourcc('M','J','P','G'), 25.0, sz);
+                    CV_Assert(writer.isOpened());
+                }
+                writer << tmp;
             }
-            writer << outMat;
+
+            tm.start();
         }
     }
     tm.stop();
-    std::cout << "Processed " << frames << " frames" << " (" << frames / tm.getTimeSec() << " FPS)" << std::endl;
+    std::cout << "Processed " << frames << " frames" << " ("
+              << frames / tm.getTimeSec() << " FPS)" << std::endl;
     return 0;
 }
diff --git a/modules/python/src2/cv2_convert.hpp b/modules/python/src2/cv2_convert.hpp
index 73ef10968b..43ef7b2302 100644
--- a/modules/python/src2/cv2_convert.hpp
+++ b/modules/python/src2/cv2_convert.hpp
@@ -268,6 +268,11 @@ PyObject* pyopencv_from(const std::vector<Tp>& value)
 template<typename K, typename V>
 bool pyopencv_to(PyObject *obj, std::map<K,V> &map, const ArgInfo& info)
 {
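+    // NB: Treat a missing argument or Python None as an empty map.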
+    if (!obj || obj == Py_None)
+    {
+        return true;
+    }
+
     PyObject* py_key = nullptr;
     PyObject* py_value = nullptr;
     Py_ssize_t pos = 0;
diff --git a/modules/python/src2/typing_stubs_generation/predefined_types.py b/modules/python/src2/typing_stubs_generation/predefined_types.py
index 39a641c021..2ebe08d162 100644
--- a/modules/python/src2/typing_stubs_generation/predefined_types.py
+++ b/modules/python/src2/typing_stubs_generation/predefined_types.py
@@ -191,13 +191,15 @@ _PREDEFINED_TYPES = (
                             PrimitiveTypeNode.str_())
                         ), export_name="SearchParams"),
     AliasTypeNode.dict_("map_string_and_string", PrimitiveTypeNode.str_("map_string_and_string::key"),
-                        PrimitiveTypeNode.str_("map_string_and_string::key::value")),
+                        PrimitiveTypeNode.str_("map_string_and_string::value")),
     AliasTypeNode.dict_("map_string_and_int", PrimitiveTypeNode.str_("map_string_and_int::key"),
-                        PrimitiveTypeNode.int_("map_string_and_int::key::value")),
+                        PrimitiveTypeNode.int_("map_string_and_int::value")),
     AliasTypeNode.dict_("map_string_and_vector_size_t", PrimitiveTypeNode.str_("map_string_and_vector_size_t::key"),
-                        SequenceTypeNode("map_string_and_vector_size_t::key::value", PrimitiveTypeNode.int_("size_t"))),
+                        SequenceTypeNode("map_string_and_vector_size_t::value", PrimitiveTypeNode.int_("size_t"))),
     AliasTypeNode.dict_("map_string_and_vector_float", PrimitiveTypeNode.str_("map_string_and_vector_float::key"),
-                        SequenceTypeNode("map_string_and_vector_float::key::value", PrimitiveTypeNode.float_())),
+                        SequenceTypeNode("map_string_and_vector_float::value", PrimitiveTypeNode.float_())),
+    AliasTypeNode.dict_("map_int_and_double", PrimitiveTypeNode.int_("map_int_and_double::key"),
+                        PrimitiveTypeNode.float_("map_int_and_double::value")),
 )
 
 PREDEFINED_TYPES = dict(zip((t.ctype_name for t in _PREDEFINED_TYPES), _PREDEFINED_TYPES))