From 97e88bd769e459c1a7b0c2d45d6b20f9030ff227 Mon Sep 17 00:00:00 2001
From: Anton Potapov <anton.potapov@intel.com>
Date: Wed, 3 Jul 2019 11:35:54 +0300
Subject: [PATCH] Fluid Internal Parallelism

 - Added a new graph compile-time argument to specify multiple independent
ROIs (Tiles)
 - Added a new "executable" with a serial loop over user-specified
ROIs (Tiles)
 - Refactored graph traversal code into a separate function to be called
once
 - Added saturate cast to Fluid AddCSimple test kernel
---
 .../opencv2/gapi/fluid/gfluidkernel.hpp       |  10 +
 .../gapi/src/backends/fluid/gfluidbackend.cpp | 137 +++++++++---
 .../gapi/src/backends/fluid/gfluidbackend.hpp |  49 ++++-
 .../test/gapi_fluid_parallel_rois_test.cpp    | 204 ++++++++++++++++++
 modules/gapi/test/gapi_fluid_test_kernels.cpp |   4 +-
 5 files changed, 370 insertions(+), 34 deletions(-)
 create mode 100644 modules/gapi/test/gapi_fluid_parallel_rois_test.cpp
diff --git a/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp b/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
index 1d8bfd80f0..18c00d9913 100644
--- a/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
+++ b/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
@@ -99,12 +99,22 @@ struct GFluidOutputRois
     std::vector<cv::gapi::own::Rect> rois;
 };
 
+struct GFluidParallelOutputRois // compile argument: several independent output-ROI sets ("tiles") for one Fluid graph
+{
+    std::vector<GFluidOutputRois> parallel_rois; // one entry per tile; the Fluid backend builds one executable per entry
+};
+
 namespace detail
 {
 template<> struct CompileArgTag<GFluidOutputRois>
 {
     static const char* tag() { return "gapi.fluid.outputRois"; }
 };
+template<> struct CompileArgTag<GFluidParallelOutputRois> // tag specialization for the parallel-ROIs compile argument
+{
+    static const char* tag() { return "gapi.fluid.parallelOutputRois"; } // tag string reported for this argument type
+};
+
 } // namespace detail
 
 namespace detail
diff --git a/modules/gapi/src/backends/fluid/gfluidbackend.cpp b/modules/gapi/src/backends/fluid/gfluidbackend.cpp
index cffd6e6d3d..0ec05c2410 100644
--- a/modules/gapi/src/backends/fluid/gfluidbackend.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidbackend.cpp
@@ -91,7 +91,13 @@ namespace
                 cv::util::throw_error(std::logic_error("GFluidOutputRois feature supports only one-island graphs"));
 
             auto rois = out_rois.value_or(cv::GFluidOutputRois());
-            return EPtr{new cv::gimpl::GFluidExecutable(graph, nodes, std::move(rois.rois))};
+
+            auto graph_data = fluidExtractInputDataFromGraph(graph, nodes);
+            const auto parallel_out_rois = cv::gimpl::getCompileArg<cv::GFluidParallelOutputRois>(args);
+            return parallel_out_rois.has_value() ?
+                       EPtr{new cv::gimpl::GParallelFluidExecutable (graph, graph_data, std::move(parallel_out_rois.value().parallel_rois))}
+                     : EPtr{new cv::gimpl::GFluidExecutable         (graph, graph_data, std::move(rois.rois))}
+            ;
         }
 
         virtual void addBackendPasses(ade::ExecutionEngineSetupContext &ectx) override;
@@ -700,27 +706,31 @@ void cv::gimpl::GFluidExecutable::initBufferRois(std::vector<int>& readStarts,
     } // while (!nodesToVisit.empty())
 }
 
-cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
-                                              const std::vector<ade::NodeHandle> &nodes,
-                                              const std::vector<cv::gapi::own::Rect> &outputRois)
-    : m_g(g), m_gm(m_g)
+cv::gimpl::FluidGraphInputData cv::gimpl::fluidExtractInputDataFromGraph(const ade::Graph &g, const std::vector<ade::NodeHandle> &nodes)
 {
-    GConstFluidModel fg(m_g);
+    decltype(FluidGraphInputData::m_agents_data)       agents_data;
+    decltype(FluidGraphInputData::m_scratch_users)     scratch_users;
+    decltype(FluidGraphInputData::m_id_map)            id_map;
+    decltype(FluidGraphInputData::m_all_gmat_ids)      all_gmat_ids;
+    std::size_t                                        mat_count = 0;
+
+    GConstFluidModel fg(g);
+    GModel::ConstGraph m_gm(g);
 
     // Initialize vector of data buffers, build list of operations
     // FIXME: There _must_ be a better way to [query] count number of DATA nodes
-    std::size_t mat_count = 0;
-    std::size_t last_agent = 0;
 
     auto grab_mat_nh = [&](ade::NodeHandle nh) {
         auto rc = m_gm.metadata(nh).get<Data>().rc;
-        if (m_id_map.count(rc) == 0)
+        if (id_map.count(rc) == 0)
         {
-            m_all_gmat_ids[mat_count] = nh;
-            m_id_map[rc] = mat_count++;
+            all_gmat_ids[mat_count] = nh;
+            id_map[rc] = mat_count++;
         }
     };
 
+    std::size_t last_agent = 0;
+
     for (const auto &nh : nodes)
     {
         switch (m_gm.metadata(nh).get<NodeType>().t)
@@ -733,15 +743,10 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
         case NodeType::OP:
         {
             const auto& fu = fg.metadata(nh).get<FluidUnit>();
-            switch (fu.k.m_kind)
-            {
-            case GFluidKernel::Kind::Filter:    m_agents.emplace_back(new FluidFilterAgent(m_g, nh));    break;
-            case GFluidKernel::Kind::Resize:    m_agents.emplace_back(new FluidResizeAgent(m_g, nh));    break;
-            case GFluidKernel::Kind::NV12toRGB: m_agents.emplace_back(new FluidNV12toRGBAgent(m_g, nh)); break;
-            default: GAPI_Assert(false);
-            }
+
+            agents_data.push_back({fu.k.m_kind, nh, {}, {}});
             // NB.: in_buffer_ids size is equal to Arguments size, not Edges size!!!
-            m_agents.back()->in_buffer_ids.resize(m_gm.metadata(nh).get<Op>().args.size(), -1);
+            agents_data.back().in_buffer_ids.resize(m_gm.metadata(nh).get<Op>().args.size(), -1);
             for (auto eh : nh->inEdges())
             {
                 // FIXME Only GMats are currently supported (which can be represented
@@ -751,23 +756,23 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
                     const auto in_port = m_gm.metadata(eh).get<Input>().port;
                     const int  in_buf  = m_gm.metadata(eh->srcNode()).get<Data>().rc;
 
-                    m_agents.back()->in_buffer_ids[in_port] = in_buf;
+                    agents_data.back().in_buffer_ids[in_port] = in_buf;
                     grab_mat_nh(eh->srcNode());
                 }
             }
             // FIXME: Assumption that all operation outputs MUST be connected
-            m_agents.back()->out_buffer_ids.resize(nh->outEdges().size(), -1);
+            agents_data.back().out_buffer_ids.resize(nh->outEdges().size(), -1);
             for (auto eh : nh->outEdges())
             {
                 const auto& data = m_gm.metadata(eh->dstNode()).get<Data>();
                 const auto out_port = m_gm.metadata(eh).get<Output>().port;
                 const int  out_buf  = data.rc;
 
-                m_agents.back()->out_buffer_ids[out_port] = out_buf;
+                agents_data.back().out_buffer_ids[out_port] = out_buf;
                 if (data.shape == GShape::GMAT) grab_mat_nh(eh->dstNode());
             }
             if (fu.k.m_scratch)
-                m_scratch_users.push_back(last_agent);
+                scratch_users.push_back(last_agent);
             last_agent++;
             break;
         }
@@ -776,12 +781,50 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
     }
 
     // Check that IDs form a continiuos set (important for further indexing)
-    GAPI_Assert(m_id_map.size() >  0);
-    GAPI_Assert(m_id_map.size() == static_cast<size_t>(mat_count));
+    GAPI_Assert(id_map.size() >  0);
+    GAPI_Assert(id_map.size() == static_cast<size_t>(mat_count));
+
+    return FluidGraphInputData {std::move(agents_data), std::move(scratch_users), std::move(id_map), std::move(all_gmat_ids), mat_count};
+}
+
+cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph                       &g,
+                                              const cv::gimpl::FluidGraphInputData   &traverse_res,
+                                              const std::vector<cv::gapi::own::Rect> &outputRois)
+    : m_g(g), m_gm(m_g)
+{
+    GConstFluidModel fg(m_g);
+
+    auto tie_traverse_res = [&traverse_res](){
+        auto& r = traverse_res;
+        return std::tie(r.m_scratch_users, r.m_id_map, r.m_all_gmat_ids, r.m_mat_count);
+    };
+
+    auto tie_this   =  [this](){
+        return std::tie(m_scratch_users, m_id_map, m_all_gmat_ids, m_num_int_buffers);
+    };
+
+    tie_this() = tie_traverse_res();
+
+    auto create_fluid_agent = [&g](agent_data_t const& agent_data) -> std::unique_ptr<FluidAgent> {
+        std::unique_ptr<FluidAgent> agent_ptr;
+        switch (agent_data.kind)
+        {
+            case GFluidKernel::Kind::Filter:    agent_ptr.reset(new FluidFilterAgent(g, agent_data.nh));      break;
+            case GFluidKernel::Kind::Resize:    agent_ptr.reset(new FluidResizeAgent(g, agent_data.nh));      break;
+            case GFluidKernel::Kind::NV12toRGB: agent_ptr.reset(new FluidNV12toRGBAgent(g, agent_data.nh));   break;
+            default: GAPI_Assert(false);
+        }
+        std::tie(agent_ptr->in_buffer_ids, agent_ptr->out_buffer_ids) = std::tie(agent_data.in_buffer_ids, agent_data.out_buffer_ids);
+        return agent_ptr;
+    };
+
+    for (auto const& agent_data : traverse_res.m_agents_data){
+        m_agents.push_back(create_fluid_agent(agent_data));
+    }
 
     // Actually initialize Fluid buffers
-    GAPI_LOG_INFO(NULL, "Initializing " << mat_count << " fluid buffer(s)" << std::endl);
-    m_num_int_buffers = mat_count;
+    GAPI_LOG_INFO(NULL, "Initializing " << m_num_int_buffers << " fluid buffer(s)" << std::endl);
+
     const std::size_t num_scratch = m_scratch_users.size();
     m_buffers.resize(m_num_int_buffers + num_scratch);
 
@@ -847,6 +890,12 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
 
     makeReshape(outputRois);
 
+    GAPI_LOG_INFO(NULL, "Internal buffers: " << std::fixed << std::setprecision(2) << static_cast<float>(total_buffers_size())/1024 << " KB\n");
+}
+
+std::size_t cv::gimpl::GFluidExecutable::total_buffers_size() const
+{
+    GConstFluidModel fg(m_g);
     std::size_t total_size = 0;
     for (const auto &i : ade::util::indexed(m_buffers))
     {
@@ -854,7 +903,7 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
         const auto idx = ade::util::index(i);
         const auto b   = ade::util::value(i);
         if (idx >= m_num_int_buffers ||
-            fg.metadata(m_all_gmat_ids[idx]).get<FluidData>().internal == true)
+            fg.metadata(m_all_gmat_ids.at(idx)).get<FluidData>().internal == true)
         {
             GAPI_Assert(b.priv().size() > 0);
         }
@@ -863,7 +912,7 @@ cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
         // (There can be non-zero sized const border buffer allocated in such buffers)
         total_size += b.priv().size();
     }
-    GAPI_LOG_INFO(NULL, "Internal buffers: " << std::fixed << std::setprecision(2) << static_cast<float>(total_size)/1024 << " KB\n");
+    return total_size;
 }
 
 namespace
@@ -1196,6 +1245,11 @@ void cv::gimpl::GFluidExecutable::packArg(cv::GArg &in_arg, const cv::GArg &op_a
 
 void cv::gimpl::GFluidExecutable::run(std::vector<InObj>  &&input_objs,
                                       std::vector<OutObj> &&output_objs)
+{ // rvalue-reference overload: thin forwarder to the lvalue-reference overload defined right below
+    run(input_objs, output_objs); // the named parameters are lvalues here, so this selects run(std::vector<InObj>&, std::vector<OutObj>&)
+}
+void cv::gimpl::GFluidExecutable::run(std::vector<InObj>  &input_objs,
+                                      std::vector<OutObj> &output_objs)
 {
     // Bind input buffers from parameters
     for (auto& it : input_objs)  bindInArg(it.first, it.second);
@@ -1269,6 +1323,31 @@ void cv::gimpl::GFluidExecutable::run(std::vector<InObj>  &&input_objs,
     }
 }
 
+cv::gimpl::GParallelFluidExecutable::GParallelFluidExecutable(const ade::Graph                      &g,
+                                                              const FluidGraphInputData             &graph_data,
+                                                              const std::vector<GFluidOutputRois>   &parallelOutputRois)
+{ // builds one GFluidExecutable ("tile") per entry of parallelOutputRois
+    for (auto&& rois : parallelOutputRois){
+        tiles.emplace_back(g, graph_data, rois.rois); // every tile shares the same graph and the same pre-extracted graph_data
+    }
+}
+
+
+void cv::gimpl::GParallelFluidExecutable::reshape(ade::Graph&, const GCompileArgs& ) // both arguments are ignored
+{
+    //TODO: implement ?
+    GAPI_Assert(false && "Not Implemented;"); // unconditionally fails; the class declares canReshape() as returning false
+}
+
+void cv::gimpl::GParallelFluidExecutable::run(std::vector<InObj>  &&input_objs,
+                                              std::vector<OutObj> &&output_objs)
+{ // runs every tile in turn (serial loop) on the same input and output objects
+    for (auto& tile : tiles ){
+        tile.run(input_objs, output_objs); // lvalue arguments: resolves to GFluidExecutable::run(std::vector<InObj>&, std::vector<OutObj>&)
+    }
+}
+
+
 // FIXME: these passes operate on graph global level!!!
 // Need to fix this for heterogeneous (island-based) processing
 void GFluidBackendImpl::addBackendPasses(ade::ExecutionEngineSetupContext &ectx)
diff --git a/modules/gapi/src/backends/fluid/gfluidbackend.hpp b/modules/gapi/src/backends/fluid/gfluidbackend.hpp
index 64242cd215..ad5e5f837d 100644
--- a/modules/gapi/src/backends/fluid/gfluidbackend.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidbackend.hpp
@@ -51,6 +51,13 @@ struct FluidData
     gapi::fluid::BorderOpt border;
 };
 
+struct agent_data_t { // per-OP-node record filled by fluidExtractInputDataFromGraph(); GFluidExecutable creates a FluidAgent from it
+     GFluidKernel::Kind  kind;            // kernel kind; selects which FluidAgent subclass gets instantiated
+     ade::NodeHandle     nh;              // graph node of the operation
+     std::vector<int>    in_buffer_ids;   // indexed by input port; holds the source Data rc, -1 where nothing was assigned
+     std::vector<int>    out_buffer_ids;  // indexed by output port; holds the destination Data rc, -1 where nothing was assigned
+ };
+
 struct FluidAgent
 {
 public:
@@ -96,6 +103,19 @@ private:
     virtual std::pair<int,int> linesReadAndnextWindow(std::size_t inPort) const = 0;
 };
 
+//Helper data structure holding the results of a single graph traversal (produced by fluidExtractInputDataFromGraph)
+struct FluidGraphInputData {
+
+    std::vector<agent_data_t>               m_agents_data;      // one record per OP node, in traversal order
+    std::vector<std::size_t>                m_scratch_users;    // positions (OP order) of the agents whose kernel has m_scratch set
+    std::unordered_map<int, std::size_t>    m_id_map;           // GMat id -> buffer idx map
+    std::map<std::size_t, ade::NodeHandle>  m_all_gmat_ids;     // buffer idx -> data node handle
+
+    std::size_t                             m_mat_count;        // number of distinct data nodes registered in m_id_map
+};
+//local helper function to traverse the graph once and pass the results to multiple instances of GFluidExecutable
+FluidGraphInputData fluidExtractInputDataFromGraph(const ade::Graph &m_g, const std::vector<ade::NodeHandle> &nodes);
+
 class GFluidExecutable final: public GIslandExecutable
 {
     const ade::Graph &m_g;
@@ -121,15 +141,36 @@ class GFluidExecutable final: public GIslandExecutable
 
     void initBufferRois(std::vector<int>& readStarts, std::vector<cv::gapi::own::Rect>& rois, const std::vector<gapi::own::Rect> &out_rois);
     void makeReshape(const std::vector<cv::gapi::own::Rect>& out_rois);
+    std::size_t total_buffers_size() const;
 
 public:
-    GFluidExecutable(const ade::Graph &g,
-                     const std::vector<ade::NodeHandle> &nodes,
-                     const std::vector<cv::gapi::own::Rect> &outputRois);
-
     virtual inline bool canReshape() const override { return true; }
     virtual void reshape(ade::Graph& g, const GCompileArgs& args) override;
 
+    virtual void run(std::vector<InObj>  &&input_objs,
+                     std::vector<OutObj> &&output_objs) override;
+
+    void run(std::vector<InObj>  &input_objs,
+             std::vector<OutObj> &output_objs);
+
+
+     GFluidExecutable(const ade::Graph                          &g,
+                      const FluidGraphInputData                 &graph_data,
+                      const std::vector<cv::gapi::own::Rect>    &outputRois);
+};
+
+
+class GParallelFluidExecutable final: public GIslandExecutable { // island executable that owns one GFluidExecutable per ROI set ("tile")
+    std::vector<GFluidExecutable> tiles; // filled by the constructor, one element per entry of parallelOutputRois
+public:
+    GParallelFluidExecutable(const ade::Graph                       &g,
+                             const FluidGraphInputData              &graph_data,
+                             const std::vector<GFluidOutputRois>    &parallelOutputRois);
+
+
+    virtual inline bool canReshape() const override { return false; } // reshape is not supported by this executable
+    virtual void reshape(ade::Graph& g, const GCompileArgs& args) override; // the definition always fails an assertion ("Not Implemented")
+
     virtual void run(std::vector<InObj>  &&input_objs,
                      std::vector<OutObj> &&output_objs) override;
 };
diff --git a/modules/gapi/test/gapi_fluid_parallel_rois_test.cpp b/modules/gapi/test/gapi_fluid_parallel_rois_test.cpp
new file mode 100644
index 0000000000..9a8d07149c
--- /dev/null
+++ b/modules/gapi/test/gapi_fluid_parallel_rois_test.cpp
@@ -0,0 +1,204 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "gapi_fluid_test_kernels.hpp"
+
+namespace opencv_test
+{
+
+namespace {
+    cv::Mat randomMat(cv::Size img_sz, int type = CV_8UC1, cv::Scalar mean   = cv::Scalar(127.0f), cv::Scalar stddev = cv::Scalar(40.f)){ // test helper: returns a new matrix of the given size/type filled by cv::randn
+        cv::Mat mat(img_sz, type);
+        cv::randn(mat, mean, stddev); // fills mat using the given mean and standard deviation
+        return mat;
+    }
+
+    cv::GFluidParallelOutputRois asGFluidParallelOutputRois(const std::vector<cv::Rect>& rois){ // turns a list of rects into a parallel-ROIs compile argument
+        cv::GFluidParallelOutputRois parallel_rois;
+        for (auto const& roi : rois) {
+            parallel_rois.parallel_rois.emplace_back(GFluidOutputRois{{to_own(roi)}}); // each rect becomes its own tile holding exactly one ROI
+        }
+        return parallel_rois;
+    }
+
+    void adjust_empty_roi(cv::Rect& roi, cv::Size size){ // in place: an empty roi is replaced by the rectangle {0,0,size}; a non-empty roi is left untouched
+        if (roi.empty()) roi = cv::Rect{{0,0}, size};
+    }
+}
+using namespace cv::gapi_test_kernels;
+
+//As GTest cannot simultaneously parameterize a test with both types and values, let's use type erasure and virtual interfaces
+//to run different computation pipelines
+struct ComputationPair { // type-erased test case: a G-API pipeline paired with its OpenCV reference implementation
+    virtual void run_with_gapi(const cv::Mat& in_mat, cv::GFluidParallelOutputRois const& parallel_rois, cv::Mat& out_mat) = 0; // writes the G-API result for the given tiles into out_mat
+    virtual void run_with_ocv (const cv::Mat& in_mat, const std::vector<cv::Rect>& rois,                 cv::Mat& out_mat) = 0; // writes the OpenCV reference result for the given ROIs into out_mat
+
+    virtual std::string name() const { return {}; } // optional display name; an empty string makes operator<< fall back to the RTTI name
+
+    virtual ~ComputationPair ()  = default;
+
+    friend std::ostream& operator<<(std::ostream& o, ComputationPair const* cp){ // streams a readable name for the pair; cp is dereferenced, so it must not be null
+        std::string custom_name = cp->name();
+        return o << (custom_name.empty() ? typeid(*cp).name() : custom_name ); // typeid(*cp) yields the dynamic type; typeid(cp) would always name the static pointer type
+    }
+};
+
+struct Blur3x3CP  : ComputationPair{ // single 3x3 blur: TBlur3x3 Fluid test kernel checked against cv::blur
+    static constexpr int borderType = BORDER_REPLICATE; // same border mode is passed to both the G-API and the OpenCV path
+    static constexpr int kernelSize = 3;                // only used by the OpenCV reference path
+
+    std::string name() const override { return "Blur3x3"; }
+    void run_with_gapi(const cv::Mat& in_mat, cv::GFluidParallelOutputRois const& parallel_rois, cv::Mat& out_mat_gapi) override {
+        cv::GMat in;
+        cv::GMat out = TBlur3x3::on(in, borderType, {}); // third argument is value-initialized (presumably the border value -- confirm against TBlur3x3)
+        cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+        // Compile with the Fluid test package and the tile description, then run G-API
+        auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage, parallel_rois));
+        cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+    }
+
+    void run_with_ocv(const cv::Mat& in_mat, const std::vector<cv::Rect>& rois, cv::Mat& out_mat_ocv) override {
+        cv::Point anchor = {-1, -1};
+        // Compute the reference result with OpenCV, one ROI at a time
+        for (auto roi : rois) { // roi is a copy on purpose: adjust_empty_roi modifies it
+            adjust_empty_roi(roi, in_mat.size());
+            cv::blur(in_mat(roi), out_mat_ocv(roi), {kernelSize, kernelSize}, anchor, borderType);
+        }
+    }
+};
+
+struct AddCCP : ComputationPair{ // add-constant pipeline: TAddCSimple Fluid test kernel checked against a cv::Mat expression
+    std::string name() const override { return "AddC"; }
+    void run_with_gapi(const cv::Mat& in_mat, cv::GFluidParallelOutputRois const& parallel_rois, cv::Mat& out_mat_gapi) override {
+        cv::GMat in;
+        cv::GMat out = TAddCSimple::on(in, 1); // constant 1 matches the "+ 1u" used in the reference path
+        cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+        // Compile with the Fluid test package and the tile description, then run G-API
+        auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage, parallel_rois));
+        cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+    }
+
+    void run_with_ocv(const cv::Mat& in_mat, const std::vector<cv::Rect>& rois, cv::Mat& out_mat_ocv) override {
+        // Compute the reference result with OpenCV, one ROI at a time
+        for (auto roi : rois) { // roi is a copy on purpose: adjust_empty_roi modifies it
+            adjust_empty_roi(roi, in_mat.size());
+            out_mat_ocv(roi) = in_mat(roi) + 1u; // assigns the matrix expression into the ROI view of out_mat_ocv
+        }
+    }
+};
+
+template<BorderTypes _borderType> // border mode applied to both blurs, in both the G-API and the OpenCV path
+struct SequenceOfBlursCP : ComputationPair{ // two chained blurs (3x3 then 5x5): TBlur3x3/TBlur5x5 checked against cv::blur
+    BorderTypes borderType = _borderType;
+
+    std::string name() const override { return "SequenceOfBlurs, border type: " + std::to_string(static_cast<int>(borderType)); }
+    void run_with_gapi(const cv::Mat& in_mat, cv::GFluidParallelOutputRois const& parallel_rois, cv::Mat& out_mat) override {
+        cv::Scalar borderValue(0);
+
+        GMat in;
+        auto mid = TBlur3x3::on(in,  borderType, borderValue);
+        auto out = TBlur5x5::on(mid, borderType, borderValue);
+
+        GComputation c(GIn(in), GOut(out));
+        auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage, parallel_rois)); // tiles are passed as a compile argument
+        cc(cv::gin(in_mat), cv::gout(out_mat));
+    }
+    void run_with_ocv (const cv::Mat& in_mat, const std::vector<cv::Rect>& rois,                 cv::Mat& out_mat) override {
+        cv::Mat mid_mat_ocv = Mat::zeros(in_mat.size(), in_mat.type());
+        cv::Point anchor = {-1, -1};
+
+        cv::blur(in_mat, mid_mat_ocv, {3,3}, anchor, borderType); // the full-image 3x3 pass does not depend on the ROI, so it is computed once
+        for (auto roi : rois) { // roi is a copy on purpose: adjust_empty_roi modifies it
+            adjust_empty_roi(roi, in_mat.size());
+            cv::blur(mid_mat_ocv(roi), out_mat(roi), {5,5}, anchor, borderType); // 5x5 pass restricted to the current ROI
+        }
+    }
+};
+
+struct TiledComputation : public TestWithParam <std::tuple<ComputationPair*, cv::Size, std::vector<cv::Rect>>> {}; // parameters: (pipeline pair, image size, list of tile ROIs)
+TEST_P(TiledComputation, Test)
+{
+    ComputationPair*        cp;
+    cv::Size                img_sz;
+    std::vector<cv::Rect>   rois ;
+    auto                    mat_type  =  CV_8UC1;
+
+    std::tie(cp, img_sz, rois) = GetParam();
+
+    cv::Mat in_mat       =      randomMat(img_sz, mat_type);
+    cv::Mat out_mat_gapi = cv::Mat::zeros(img_sz, mat_type); // both outputs start zeroed, so pixels outside the ROIs compare equal
+    cv::Mat out_mat_ocv  = cv::Mat::zeros(img_sz, mat_type);
+
+    cp->run_with_gapi(in_mat, asGFluidParallelOutputRois(rois),  out_mat_gapi);
+    cp->run_with_ocv (in_mat, rois,                              out_mat_ocv);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv)) // passes only when no element differs; the matrices are dumped on failure
+            << "in_mat : \n"      << in_mat << std::endl
+            << "diff matrix :\n " << (out_mat_gapi != out_mat_ocv) << std::endl
+            << "out_mat_gapi: \n" << out_mat_gapi << std::endl
+            << "out_mat_ocv:  \n" << out_mat_ocv << std::endl;
+}
+namespace {
+    //this is ugly but other variants (like using shared_ptr) are IMHO even more ugly :)
+    template<typename T, typename... Arg> // one function-local static T exists per distinct <T, Arg...> instantiation
+    T* addr_of_static(Arg... arg) {
+        static T obj(std::forward<Arg>(arg)...); // constructed on the first call only; arguments of later calls are ignored
+        return &obj; // address of the static object, so the pointer stays valid after the call returns
+    }
+}
+
+auto single_arg_computations = [](){ // returns the Values(...) generator listing every ComputationPair under test
+    return Values(  addr_of_static<Blur3x3CP>(),
+                    addr_of_static<AddCCP>(),
+                    addr_of_static<SequenceOfBlursCP<BORDER_CONSTANT>>(),
+                    addr_of_static<SequenceOfBlursCP<BORDER_REPLICATE>>(),
+                    addr_of_static<SequenceOfBlursCP<BORDER_REFLECT_101>>()
+            );
+
+};
+
+INSTANTIATE_TEST_CASE_P(FluidTiledSerial8x10, TiledComputation,
+                        Combine(
+                            single_arg_computations(),
+                            Values(cv::Size(8, 10)),
+                            Values(std::vector<cv::Rect>{cv::Rect{}}, // single empty rect: the OpenCV reference path expands it to the whole image (adjust_empty_roi)
+                                   std::vector<cv::Rect>{cv::Rect{0,0,8,5}, cv::Rect{0,5,8,5}}, // two tiles covering the whole image
+                                   std::vector<cv::Rect>{cv::Rect{0,1,8,3}, cv::Rect{0,4,8,3}}, // adjacent tiles covering rows 1..6 only
+                                   std::vector<cv::Rect>{cv::Rect{0,2,8,3}, cv::Rect{0,5,8,2}}, // adjacent tiles covering rows 2..6 only
+                                   std::vector<cv::Rect>{cv::Rect{0,3,8,4}, cv::Rect{0,9,8,1}})) // non-adjacent tiles: rows 3..6 and row 9
+);
+
+INSTANTIATE_TEST_CASE_P(FluidTiledSerial20x15, TiledComputation,
+                        Combine(
+                            single_arg_computations(),
+                            Values(cv::Size(20, 15)),
+                            Values(std::vector<cv::Rect>{cv::Rect{}}, // single empty rect: the OpenCV reference path expands it to the whole image (adjust_empty_roi)
+                                   std::vector<cv::Rect>{cv::Rect{{0,0},cv::Size{20,7}}, // uneven split: 7 rows + 8 rows
+                                                         cv::Rect{{0,7},cv::Size{20,8}}}))
+);
+
+INSTANTIATE_TEST_CASE_P(FluidTiledSerial320x240, TiledComputation,
+                        Combine(
+                            single_arg_computations(),
+                            Values(cv::Size(320, 240)),
+                            Values(std::vector<cv::Rect>{cv::Rect{{0,0},   cv::Size{320,120}}, // two equal halves
+                                                         cv::Rect{{0,120}, cv::Size{320,120}}},
+
+                                   std::vector<cv::Rect>{cv::Rect{{0,0},   cv::Size{320,120}}, // NOTE(review): identical to the ROI set above -- confirm whether a different split was intended
+                                                         cv::Rect{{0,120}, cv::Size{320,120}}},
+
+                                   std::vector<cv::Rect>{cv::Rect{{0,0},  cv::Size{320,60}}, // three tiles: 60 + 60 + 120 rows
+                                                         cv::Rect{{0,60}, cv::Size{320,60}},
+                                                         cv::Rect{{0,120},cv::Size{320,120}}}))
+);
+
+//FIXME: add multiple outputs tests
+} // namespace opencv_test
diff --git a/modules/gapi/test/gapi_fluid_test_kernels.cpp b/modules/gapi/test/gapi_fluid_test_kernels.cpp
index fcc8d9b058..7c4904cc5e 100644
--- a/modules/gapi/test/gapi_fluid_test_kernels.cpp
+++ b/modules/gapi/test/gapi_fluid_test_kernels.cpp
@@ -9,6 +9,7 @@
 #include <iomanip>
 #include "gapi_fluid_test_kernels.hpp"
 #include <opencv2/gapi/core.hpp>
+#include <opencv2/gapi/own/saturate.hpp>
 
 namespace cv
 {
@@ -72,7 +73,8 @@ GAPI_FLUID_KERNEL(FAddCSimple, TAddCSimple, false)
             for (int i = 0, w = in.length(); i < w; i++)
             {
                 //std::cout << std::setw(4) << int(in_row[i]);
-                out_row[i] = static_cast<uint8_t>(in_row[i] + cval);
+                //FIXME: it seems that other kernels might need it as well
+                out_row[i] = cv::gapi::own::saturate<uint8_t>(in_row[i] + cval);
             }
             //std::cout << std::endl;
         }