Merge pull request #16161 from YashasSamaga:cuda4dnn-concat-fusion

cuda4dnn(concat): write outputs from previous layers directly into concat's output

* eliminate concat by directly writing to its output buffer (sketched below)

* fix concat fusion not happening sometimes

* use a whitelist instead of a blacklist
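
In outline: when a Concat joins its inputs along the channel axis and each input is produced by exactly one layer, every producer can be handed a window into the Concat output buffer at the right channel offset, write its result there directly, and the Concat layer itself can be skipped. Below is a minimal sketch of that offset arithmetic for NCHW tensors with batch size 1; the TensorWindow struct and makeWindows helper are invented for illustration and are not part of cuda4dnn.

    #include <cstddef>
    #include <vector>

    // A "window" into the shared concat output: a pointer offset into the
    // buffer plus the shape of the slice this producer is allowed to write.
    struct TensorWindow {
        float* data;   // points into the concat output buffer
        int c, h, w;   // slice shape (batch of 1 assumed)
    };

    // Compute the window for each input of the concat. Each producer then
    // writes directly into its window and the concat has nothing left to do.
    std::vector<TensorWindow> makeWindows(float* concat_output, int h, int w,
                                          const std::vector<int>& channels_per_input)
    {
        std::vector<TensorWindow> windows;
        std::size_t offset = 0;  // in elements, not bytes
        for (int c : channels_per_input) {
            windows.push_back({concat_output + offset, c, h, w});
            offset += static_cast<std::size_t>(c) * h * w;  // skip past this slice
        }
        return windows;
    }

This is the same arithmetic the dnn.cpp change below performs when it calls cuda_wrapper->update(shape, offset) with offset = chrange[1].start * (output.size[2] * output.size[3]).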
Authored by Yashas Samaga B L, committed via GitHub 5 years ago
parent 77dd40c96c
commit 1f695c4532
Changed files:
  1. modules/dnn/src/dnn.cpp (48 changed lines)
  2. modules/dnn/src/op_cuda.hpp (24 changed lines)

@@ -2470,7 +2470,9 @@ struct Net::Impl
                      ld.layerInstance->type != "Concat")) )
                 continue;
-            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
+            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
+                && ld.layerInstance->type != "Convolution"
+                && ld.layerInstance->type != "Concat")
                 continue;
             while (nextData)
@@ -2626,7 +2628,7 @@ struct Net::Impl
                 }
             }
-            if (preferableBackend != DNN_BACKEND_OPENCV)
+            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                 continue; // Go to the next layer.
             // the optimization #2. if there is concat layer that concatenates channels
@@ -2694,6 +2696,15 @@ struct Net::Impl
                 if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                     break;
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA &&
+                    (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
+                     (inp_i_data->layerInstance->type != "Convolution" &&
+                      inp_i_data->layerInstance->type != "Pooling")))
+                {
+                    break;
+                }
+#endif
                 realinputs[i] = pin;
             }
@@ -2711,6 +2722,10 @@ struct Net::Impl
                     umats[0] = umat_output;
                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                 }
 #endif
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA)
+                    ld.outputBlobsWrappers[0] = wrap(output);
+#endif
                 Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
                 int ofs = 0;
@@ -2735,11 +2750,40 @@ struct Net::Impl
                         umats[pin.oid] = umat_output(chrange);
                         OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                     }
 #endif
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                    {
+                        auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
+                        auto offset = chrange[1].start * (output.size[2] * output.size[3]);
+                        auto shape = MatShape{1, chrange[1].size(), output.size[2], output.size[3]};
+                        cuda_wrapper->update(shape, offset);
+                        inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
+                    }
+#endif
                     // Layers that refer old input Mat will refer to the
                     // new data but the same Mat object.
                     CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                 }
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA)
+                {
+                    for (int i = 0; i < ld.consumers.size(); i++)
+                    {
+                        LayerData& consumer = layers[ld.consumers[i].lid];
+                        for (int j = 0; j < consumer.inputBlobsId.size(); j++)
+                        {
+                            if (consumer.inputBlobsId[j].lid == ld.id)
+                            {
+                                CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
+                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                break;
+                            }
+                        }
+                    }
+                }
+#endif
                 ld.skip = true;
                 printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
             }

@@ -217,6 +217,8 @@ namespace cv { namespace dnn {
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
         virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+        virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
     namespace cuda4dnn { namespace detail {
@@ -276,6 +278,7 @@ namespace cv { namespace dnn {
             : CUDABackendWrapper(TargetID)
         {
             shape = cv::dnn::shape(m);
+            offset = 0;
             shared_block = std::make_shared<shared_block_type>();
             shared_block->host_dirty = true;
@@ -300,6 +303,7 @@ namespace cv { namespace dnn {
             CV_Assert(base);
             shape = shape_;
+            offset = 0;
             shared_block = base->shared_block;
         }
@@ -313,6 +317,8 @@ namespace cv { namespace dnn {
         void copyToHost() override {
             if (shared_block->device_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
@@ -339,6 +345,8 @@ namespace cv { namespace dnn {
         void copyToDevice() override {
             if (shared_block->host_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
@@ -365,13 +373,24 @@ namespace cv { namespace dnn {
             shared_block->stream = std::move(stream);
         }
+        void update(const MatShape& shape_, std::size_t offset_) override {
+            auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<MatShape::value_type>());
+            if (offset_ + total > shared_block->device.size()) {
+                CV_Error(Error::BadOffset, "shape and offset provided can potentially lead to OOB access");
+            }
+            shape = shape_;
+            offset = offset_;
+        }
         cv::Mat getMutableHostMat() noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             setHostDirty();
             return shared_block->host;
         }
         const cv::Mat getImmutableHostMat() const noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             return shared_block->host;
         }
@@ -388,12 +407,12 @@
          */
         tensor_span_type getSpan() noexcept {
             setDeviceDirty();
-            return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
         tensor_view_type getView() noexcept {
             copyToDevice();
-            return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
     private:
@@ -407,6 +426,7 @@
          */
         MatShape shape;
+        std::size_t offset;
         struct shared_block_type {
             bool host_dirty;
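
To make the new wrapper semantics concrete, here is a small host-only sketch; FakeWrapper is invented for illustration (it is not the real GenericCUDABackendWrapper and returns a raw pointer instead of a tensor span). After update(shape, offset), spans and views start offset elements into the shared buffer, which is what lets several wrappers alias disjoint channel slices of one concat output.

    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Host-only stand-in for the wrapper: update() re-points the "span" at a
    // sub-region of a shared buffer, mirroring shared_block->device.get() + offset.
    class FakeWrapper {
    public:
        explicit FakeWrapper(std::size_t capacity) : buffer_(capacity), offset_(0) {}

        void update(const std::vector<int>& shape, std::size_t offset) {
            // Reject shape/offset pairs that would run past the allocation,
            // analogous to the CV_Error(Error::BadOffset, ...) check above.
            std::size_t total = std::accumulate(shape.begin(), shape.end(),
                                                std::size_t(1), std::multiplies<std::size_t>());
            assert(offset + total <= buffer_.size() && "shape and offset would go out of bounds");
            shape_ = shape;
            offset_ = offset;
        }

        float* getSpan() { return buffer_.data() + offset_; }  // starts offset elements in

    private:
        std::vector<float> buffer_;
        std::vector<int> shape_;
        std::size_t offset_;
    };

    int main() {
        FakeWrapper wrapper(1 * 8 * 4 * 4);         // concat output: 1x8x4x4 floats
        float* base = wrapper.getSpan();            // offset is 0 initially
        wrapper.update({1, 5, 4, 4}, 3 * 4 * 4);    // second input owns channels [3, 8)
        assert(wrapper.getSpan() == base + 48);     // its window starts 48 elements in
        return 0;
    }

Note that in the real wrapper the copyToHost/copyToDevice paths assert offset == 0: dirty tracking works on the whole shared block, not on individual slices.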
