From cbdaa93e548ff13e28d982971f05f5d510d88a4b Mon Sep 17 00:00:00 2001
From: YashasSamaga
Date: Sun, 5 Jul 2020 20:52:35 +0530
Subject: [PATCH] reduce slice, concat to copy; enable more concat fusions

---
 modules/dnn/src/cuda/concat.cu                | 16 ++++++++++
 modules/dnn/src/cuda/slice.cu                 | 32 +++++++++++++++++++
 modules/dnn/src/cuda4dnn/primitives/slice.hpp | 14 --------
 modules/dnn/src/dnn.cpp                       |  8 ++++-
 4 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu
index 7d5955c6a2..ac1be75682 100644
--- a/modules/dnn/src/cuda/concat.cu
+++ b/modules/dnn/src/cuda/concat.cu
@@ -16,6 +16,8 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 #include <vector>
 
@@ -95,6 +97,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         TensorSpan<T> output, std::size_t output_axis_offset,
         TensorView<T> input, std::size_t axis)
     {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output_axis_offset < output.get_axis_size(axis));
+
+        /* if the axes preceding the concat axis are all singleton, the concat blocks are
+         * contiguous in the output and we can copy each block directly
+         */
+        if (output.size_range(0, axis) == 1)
+        {
+            auto stride = output.size_range(axis + 1, output.rank());
+            auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
+            kernels::copy<T>(stream, sliced_output, input);
+            return;
+        }
+
         /* let's call the axis of interest the channel axis for the purpose of the following discussion
          * even though it can be any axis
          *
diff --git a/modules/dnn/src/cuda/slice.cu b/modules/dnn/src/cuda/slice.cu
index 5375345bd8..37b718cd63 100644
--- a/modules/dnn/src/cuda/slice.cu
+++ b/modules/dnn/src/cuda/slice.cu
@@ -15,11 +15,14 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 
 #include <vector>
 #include <iterator>
 #include <utility>
+#include <algorithm>
 
 using namespace cv::dnn::cuda4dnn::csl;
 using namespace cv::dnn::cuda4dnn::csl::device;
@@ -79,6 +82,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         CV_Assert(output.rank() == input.rank());
         CV_Assert(output.rank() == offsets.size());
 
+        /* copy directly if no slicing is required */
+        if (is_shape_same(output, input))
+        {
+            CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
+            kernels::copy<T>(stream, output, input);
+            return;
+        }
+
         /* squeezable axes at the beginning of both tensors can be eliminated
          *
          * Reasoning:
@@ -146,6 +157,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
         auto rank = inShape.size();
 
+        /* We can do a copy if the reduced rank is two and only the first axis is sliced.
+         * The general requirement is that only one axis is sliced and all the axes that
+         * precede the sliced axis are singleton. However, the reductions above will remove
+         * all the leading singleton axes and merge the trailing unsliced axes into one, or
+         * zero if there are no trailing unsliced axes. The latter is handled separately.
+         */
+        if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
+        {
+            auto stride = inShape[1];
+            auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
+        if (rank == 1)
+        {
+            auto sliced_input = View<T>(input.get() + offsets[0], output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
         std::vector<std::size_t> inStride(rank), outStride(rank);
         inStride.back() = 1;
         outStride.back() = 1;
diff --git a/modules/dnn/src/cuda4dnn/primitives/slice.hpp b/modules/dnn/src/cuda4dnn/primitives/slice.hpp
index 900b580b45..f83d4adf44 100644
--- a/modules/dnn/src/cuda4dnn/primitives/slice.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/slice.hpp
@@ -47,20 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
 
             CV_Assert(offsets.size() == outputs.size());
 
-            /* one output with the same shape as the input => direct copy */
-            if (outputs.size() == 1)
-            {
-                auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
-                auto output = output_wrapper->getSpan();
-
-                if (is_shape_same(output, input))
-                {
-                    CV_Assert(std::all_of(std::begin(offsets[0]), std::end(offsets[0]), [] (std::size_t x) { return x == 0; }));
-                    kernels::copy<T>(stream, output, input);
-                    return;
-                }
-            }
-
             for (int i = 0; i < outputs.size(); ++i)
             {
                 auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 88ba8ebdd6..e5d7dd92fb 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2788,7 +2788,13 @@ struct Net::Impl : public detail::NetImplBase
                 if (preferableBackend == DNN_BACKEND_CUDA &&
                     (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                      (inp_i_data->layerInstance->type != "Convolution" &&
-                      inp_i_data->layerInstance->type != "Pooling")))
+                      inp_i_data->layerInstance->type != "Pooling" &&
+                      inp_i_data->layerInstance->type != "Resize" &&
+                      inp_i_data->layerInstance->type != "Flatten" &&
+                      inp_i_data->layerInstance->type != "Permute" &&
+                      inp_i_data->layerInstance->type != "Reorg" &&
+                      inp_i_data->layerInstance->type != "Eltwise" &&
+                      inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
                 {
                     break;
                 }
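
Note on the copy reductions (an illustration only, not part of the patch): both fast paths rest on the same contiguity argument. When every axis before the sliced or concatenated axis is singleton, the selected region is a single contiguous block in memory, so one pointer offset plus a flat copy can replace the generic strided kernel. Below is a minimal host-side C++ sketch of the offset arithmetic; the helper stride_after is hypothetical, and the real kernels operate on TensorSpan/TensorView and dispatch kernels::copy on a CUDA stream.

    // Host-side illustration of the contiguous slice fast path.
    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <functional>
    #include <numeric>
    #include <vector>

    // elements spanned by one step along `axis` (product of the trailing dims)
    static std::size_t stride_after(const std::vector<std::size_t>& shape, std::size_t axis)
    {
        return std::accumulate(shape.begin() + axis + 1, shape.end(),
                               std::size_t(1), std::multiplies<std::size_t>());
    }

    int main()
    {
        // input [1, 6, 4]; take a [1, 2, 4] slice starting at offset 3 on axis 1;
        // all axes before axis 1 are singleton, so the slice is one contiguous block
        std::vector<std::size_t> in_shape {1, 6, 4};
        std::size_t axis = 1, axis_offset = 3;

        std::vector<float> input(1 * 6 * 4);
        std::iota(input.begin(), input.end(), 0.0f);   // 0, 1, ..., 23
        std::vector<float> output(1 * 2 * 4);

        // one pointer offset + flat copy stands in for the strided slice kernel
        std::size_t stride = stride_after(in_shape, axis);
        std::memcpy(output.data(), input.data() + axis_offset * stride,
                    output.size() * sizeof(float));

        assert(output.front() == 12.0f && output.back() == 19.0f);
    }

The concat fast path is the mirror image: each input block lands at output.get() + output_axis_offset * stride. Writing a producer's output into the shared concat buffer amounts to the same offset copy, which is what lets the dnn.cpp hunk whitelist more producer layers for concat fusion.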