From cbdaa93e548ff13e28d982971f05f5d510d88a4b Mon Sep 17 00:00:00 2001
From: YashasSamaga
Date: Sun, 5 Jul 2020 20:52:35 +0530
Subject: [PATCH] reduce slice, concat to copy; enable more concat fusions

---
 modules/dnn/src/cuda/concat.cu                | 16 ++++++++++
 modules/dnn/src/cuda/slice.cu                 | 32 +++++++++++++++++++
 modules/dnn/src/cuda4dnn/primitives/slice.hpp | 14 --------
 modules/dnn/src/dnn.cpp                       |  8 ++++-
 4 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu
index 7d5955c6a2..ac1be75682 100644
--- a/modules/dnn/src/cuda/concat.cu
+++ b/modules/dnn/src/cuda/concat.cu
@@ -16,6 +16,8 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 #include <vector>
 
@@ -95,6 +97,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         TensorSpan<T> output, std::size_t output_axis_offset,
         TensorView<T> input, std::size_t axis)
     {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output_axis_offset < output.get_axis_size(axis));
+
+        /* if the axes preceding the concat axis are all singleton, the concat blocks are
+         * contiguous in the output and we can copy each block directly
+         */
+        if (output.size_range(0, axis) == 1)
+        {
+            auto stride = output.size_range(axis + 1, output.rank());
+            auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
+            kernels::copy<T>(stream, sliced_output, input);
+            return;
+        }
+
         /* let's call the axis of interest the channel axis for the purpose of the following discussion
          * even though it can be any axis
          *
diff --git a/modules/dnn/src/cuda/slice.cu b/modules/dnn/src/cuda/slice.cu
index 5375345bd8..37b718cd63 100644
--- a/modules/dnn/src/cuda/slice.cu
+++ b/modules/dnn/src/cuda/slice.cu
@@ -15,11 +15,14 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 
 #include <vector>
 #include <iterator>
 #include <utility>
+#include <algorithm>
 
 using namespace cv::dnn::cuda4dnn::csl;
 using namespace cv::dnn::cuda4dnn::csl::device;
@@ -79,6 +82,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         CV_Assert(output.rank() == input.rank());
         CV_Assert(output.rank() == offsets.size());
 
+        /* copy directly if no slicing is required */
+        if (is_shape_same(output, input))
+        {
+            CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
+            kernels::copy<T>(stream, output, input);
+            return;
+        }
+
         /* squeezable axes at the beginning of both tensors can be eliminated
          *
          * Reasoning:
@@ -146,6 +157,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
         auto rank = inShape.size();
 
+        /* We can do a copy if the reduced rank is two and only the first axis is sliced.
+         * The general requirement is that only one axis is sliced and all the axes that
+         * precede the sliced axis are singleton. However, the reductions above will remove
+         * all the leading singleton axes and merge the trailing unsliced axes into one, or
+         * zero if there are no trailing unsliced axes. The latter is handled separately.
+         */
+        if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
+        {
+            auto stride = inShape[1];
+            auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
+        if (rank == 1)
+        {
+            auto sliced_input = View<T>(input.get() + offsets[0], output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
         std::vector<std::size_t> inStride(rank), outStride(rank);
         inStride.back() = 1;
         outStride.back() = 1;
diff --git a/modules/dnn/src/cuda4dnn/primitives/slice.hpp b/modules/dnn/src/cuda4dnn/primitives/slice.hpp
index 900b580b45..f83d4adf44 100644
--- a/modules/dnn/src/cuda4dnn/primitives/slice.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/slice.hpp
@@ -47,20 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
 
             CV_Assert(offsets.size() == outputs.size());
 
-            /* one output with the same shape as the input => direct copy */
-            if (outputs.size() == 1)
-            {
-                auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
-                auto output = output_wrapper->getSpan();
-
-                if (is_shape_same(output, input))
-                {
-                    CV_Assert(std::all_of(std::begin(offsets[0]), std::end(offsets[0]), [] (std::size_t x) { return x == 0; }));
-                    kernels::copy<T>(stream, output, input);
-                    return;
-                }
-            }
-
             for (int i = 0; i < outputs.size(); ++i)
             {
                 auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 88ba8ebdd6..e5d7dd92fb 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2788,7 +2788,13 @@ struct Net::Impl : public detail::NetImplBase
                 if (preferableBackend == DNN_BACKEND_CUDA &&
                     (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                      (inp_i_data->layerInstance->type != "Convolution" &&
-                      inp_i_data->layerInstance->type != "Pooling")))
+                      inp_i_data->layerInstance->type != "Pooling" &&
+                      inp_i_data->layerInstance->type != "Resize" &&
+                      inp_i_data->layerInstance->type != "Flatten" &&
+                      inp_i_data->layerInstance->type != "Permute" &&
+                      inp_i_data->layerInstance->type != "Reorg" &&
+                      inp_i_data->layerInstance->type != "Eltwise" &&
+                      inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
                 {
                     break;
                 }
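
Note on the copy reductions (an illustration only, not part of the patch): both fast paths rest on the same contiguity argument. When every axis before the sliced or concatenated axis is singleton, the selected region is a single contiguous block in memory, so one pointer offset plus a flat copy can replace the generic strided kernel. Below is a minimal host-side C++ sketch of the offset arithmetic; the helper stride_after is hypothetical, and the real kernels operate on TensorSpan/TensorView and dispatch kernels::copy on a CUDA stream.

    // Host-side illustration of the contiguous slice fast path.
    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <functional>
    #include <numeric>
    #include <vector>

    // elements spanned by one step along `axis` (product of the trailing dims)
    static std::size_t stride_after(const std::vector<std::size_t>& shape, std::size_t axis)
    {
        return std::accumulate(shape.begin() + axis + 1, shape.end(),
                               std::size_t(1), std::multiplies<std::size_t>());
    }

    int main()
    {
        // input [1, 6, 4]; take a [1, 2, 4] slice starting at offset 3 on axis 1;
        // all axes before axis 1 are singleton, so the slice is one contiguous block
        std::vector<std::size_t> in_shape {1, 6, 4};
        std::size_t axis = 1, axis_offset = 3;

        std::vector<float> input(1 * 6 * 4);
        std::iota(input.begin(), input.end(), 0.0f);   // 0, 1, ..., 23
        std::vector<float> output(1 * 2 * 4);

        // one pointer offset + flat copy stands in for the strided slice kernel
        std::size_t stride = stride_after(in_shape, axis);
        std::memcpy(output.data(), input.data() + axis_offset * stride,
                    output.size() * sizeof(float));

        assert(output.front() == 12.0f && output.back() == 19.0f);
    }

The concat fast path is the mirror image: each input block lands at output.get() + output_axis_offset * stride. Writing a producer's output into the shared concat buffer amounts to the same offset copy, which is what lets the dnn.cpp hunk whitelist more producer layers for concat fusion.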