use optimized cuDNN path for conv + bias + relu

5 years ago · 17a35587e1
parent 9ec3d76b21
commit 17a35587e1
4 changed files with 281 additions and 57 deletions
--- a/modules/dnn/src/cuda4dnn/csl/cudnn/activation.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cudnn/activation.hpp
@ -0,0 +1,80 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
+
+#include <cudnn.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    class ActivationDescriptor {
+    public:
+        enum class ActivationType {
+            IDENTITY,
+            RELU,
+            CLIPPED_RELU,
+            TANH,
+            SIGMOID,
+            ELU
+        };
+
+        ActivationDescriptor() noexcept : descriptor{ nullptr } { }
+        ActivationDescriptor(const ActivationDescriptor&) = delete;
+        ActivationDescriptor(ActivationDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /* `relu_ceiling_or_elu_alpha`:
+         * - `alpha` coefficient in ELU activation
+         * - `ceiling` for CLIPPED_RELU activation
+         */
+        ActivationDescriptor(ActivationType type, double relu_ceiling_or_elu_alpha = 0.0) {
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateActivationDescriptor(&descriptor));
+            try {
+                const auto mode = [type] {
+                    switch(type) {
+                        case ActivationType::IDENTITY: return CUDNN_ACTIVATION_IDENTITY;
+                        case ActivationType::RELU: return CUDNN_ACTIVATION_RELU;
+                        case ActivationType::CLIPPED_RELU: return CUDNN_ACTIVATION_CLIPPED_RELU;
+                        case ActivationType::SIGMOID: return CUDNN_ACTIVATION_SIGMOID;
+                        case ActivationType::TANH: return CUDNN_ACTIVATION_TANH;
+                        case ActivationType::ELU: return CUDNN_ACTIVATION_ELU;
+                    }
+                    CV_Assert(0);
+                    return CUDNN_ACTIVATION_IDENTITY;
+                } ();
+
+                CUDA4DNN_CHECK_CUDNN(cudnnSetActivationDescriptor(descriptor, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling_or_elu_alpha));
+            } catch(...) {
+                /* cudnnDestroyActivationDescriptor will not fail for a valid descriptor object */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        ~ActivationDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyActivationDescriptor will not fail */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
+            }
+        }
+
+        ActivationDescriptor& operator=(const ActivationDescriptor&) = delete;
+        ActivationDescriptor& operator=(ActivationDescriptor&& other) noexcept {
+            descriptor = other.descriptor;
+            other.descriptor = nullptr;
+            return *this;
+        };
+
+        cudnnActivationDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        cudnnActivationDescriptor_t descriptor;
+    };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP */
--- a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
@ -6,6 +6,7 @@
 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP

 #include "cudnn.hpp"
+#include "activation.hpp"

 #include "../pointer.hpp"
 #include "../workspace.hpp"
@ -405,6 +406,93 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
        );
    }

+    /** @brief performs convolution, bias addition and activation simultaneously
+     *
+     * dstValue = act(alpha * conv(input) + bias)
+     *
+     * @tparam          T           convolution element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuDNN Handle
+     * @param           convDesc    convolution description
+     * @param           convAlgo    algorithm to use for convolution
+     * @param           workspace   workspace memory which meets the requirements of \p convAlgo
+     * @param           filterDesc  filter descriptor
+     * @param[in]       filterPtr   pointer to device memory containing the filters
+     * @param           alpha       convolution scale factor
+     * @param           inputDesc   tensor descriptor describing the input
+     * @param[in]       inputPtr    pointer to input tensor in device memory
+     * @param           biasDesc    tensor descriptor describing the bias
+     * @param[in]       biasPtr     pointer to bias tensor in device memory
+     * @param           actDesc     activation descriptor
+     * @param           outputDesc  tensor descriptor describing the output
+     * @param[out]      outputPtr   pointer to output tensor in device memory
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void convolve_with_bias_activation(
+        const Handle& handle,
+        T alpha,
+        const ConvolutionDescriptor<T>& convDesc,
+        const ConvolutionAlgorithm<T>& convAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<T>& filterDesc,
+        DevicePtr<const T> filterPtr,
+        const TensorDescriptor<T>& inputDesc,
+        DevicePtr<const T> inputPtr,
+        const TensorDescriptor<T>& biasDesc,
+        DevicePtr<const T> biasPtr,
+        const ActivationDescriptor& actDesc,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr)
+    {
+        CV_Assert(handle);
+
+        T alpha2 = 0.0;
+        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
+            handle.get(),
+            &alpha, inputDesc.get(), inputPtr.get(),
+            filterDesc.get(), filterPtr.get(),
+            convDesc.get(), convAlgo.get(),
+            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+            &alpha2, outputDesc.get(), outputPtr.get(),
+            biasDesc.get(), biasPtr.get(),
+            actDesc.get(),
+            outputDesc.get(), outputPtr.get()));
+    }
+
+    template <> inline
+    void convolve_with_bias_activation(
+        const Handle& handle,
+        half alpha,
+        const ConvolutionDescriptor<half>& convDesc,
+        const ConvolutionAlgorithm<half>& convAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<half>& filterDesc,
+        DevicePtr<const half> filterPtr,
+        const TensorDescriptor<half>& inputDesc,
+        DevicePtr<const half> inputPtr,
+        const TensorDescriptor<half>& biasDesc,
+        DevicePtr<const half> biasPtr,
+        const ActivationDescriptor& actDesc,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr)
+    {
+        CV_Assert(handle);
+
+        float alpha_ = alpha, alpha2 = 0.0;
+        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
+            handle.get(),
+            &alpha_, inputDesc.get(), inputPtr.get(),
+            filterDesc.get(), filterPtr.get(),
+            convDesc.get(), convAlgo.get(),
+            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+            &alpha2, outputDesc.get(), outputPtr.get(),
+            biasDesc.get(), biasPtr.get(),
+            actDesc.get(),
+            outputDesc.get(), outputPtr.get()));
+    }
+
 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */
--- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
@ -135,17 +135,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
        using FilterDescriptor = cudnn::FilterDescriptor<T>;
        using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
        using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
+        using ActivationDescriptor = cudnn::ActivationDescriptor;

    public:
+        using ActivationType = ActivationDescriptor::ActivationType;
+
        struct params_type {
+            /* convolution */
            std::vector<std::size_t> input_shape;
            std::vector<std::size_t> filter_shape;
-
            std::vector<std::size_t> padding;
            std::vector<std::size_t> stride;
            std::vector<std::size_t> dilation;
-
            std::size_t groups;
+
+            /* bias and activation (only RELU supported) */
+            std::vector<std::size_t> bias_shape;
+            ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */
        };

        Convolution() = default;
@ -158,6 +164,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
            filterDesc = FilterDescriptor(params.filter_shape);
            convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);

+            if (!params.bias_shape.empty()) {
+                CV_Assert(params.activation_type == ActivationType::RELU);
+                biasTensorDesc = TensorDescriptor(params.bias_shape);
+                activationDesc = ActivationDescriptor(params.activation_type, 0.0);
+            } else {
+                CV_Assert(params.activation_type == ActivationType::IDENTITY);
+            }
+
            std::vector<int> output_dims;
            getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
            outputTensorDesc = TensorDescriptor(output_dims);
@ -182,12 +196,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
            );
        }

+        void convolve_with_bias_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, WorkspaceInstance scratchpad) {
+            cudnn::convolve_with_bias_activation<T>(
+                cudnnHandle,
+                1.0, convDesc, algo, scratchpad,
+                filterDesc, filters.get(),
+                inputTensorDesc, input.get(),
+                biasTensorDesc, bias.get(),
+                activationDesc,
+                outputTensorDesc, output.get()
+            );
+        }
+
    private:
        cudnn::Handle cudnnHandle;
        TensorDescriptor inputTensorDesc, outputTensorDesc;
        FilterDescriptor filterDesc;
        ConvolutionDescriptor convDesc;
        ConvolutionAlgorithm algo;
+        TensorDescriptor biasTensorDesc;
+        ActivationDescriptor activationDesc;
    };

    template <class T>
--- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
@ -204,6 +204,21 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            params.dilation = dilations;
            params.groups = config.groups;

+            /* check if we can perform fused convolution using cudnn */
+            params.activation_type = csl::Convolution<T>::ActivationType::IDENTITY;
+            fusion_location = InternalFusionLocation::NATIVE;
+            if (!biasTensor.empty() &&
+                biasTensor.size() == output_feature_maps && /* cuDNN requirement */
+                config.activation_type == ConvolutionConfiguration::ActivationType::RELU &&
+                config.relu_negative_slope == 0.0)
+            {
+                fusion_location = InternalFusionLocation::CUDNN;
+                auto bias_shape = std::vector<std::size_t>(rank, 1);
+                bias_shape[1] = output_feature_maps;
+                params.bias_shape = bias_shape;
+                params.activation_type = csl::Convolution<T>::ActivationType::RELU;
+            }
+
            convoluter = csl::Convolution<T>(cudnnHandle, params);

            activation = config.activation_type;
@ -216,7 +231,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                activation = ConvolutionConfiguration::ActivationType::IDENTITY;

            csl::WorkspaceBuilder builder;
-            if (!transformed_shape.empty()) {
+            if (!transformed_shape.empty())
+            {
                auto& shape = transformed_shape;
                auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>());
                builder.require<T>(sz);
@ -248,65 +264,72 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

-            convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
-            if (!biasTensor.empty())
+            if (fusion_location == InternalFusionLocation::CUDNN)
            {
-                std::size_t inner_size = output.size_range(2, output.rank());
-                switch(activation)
-                {
-                    case ConvolutionConfiguration::ActivationType::IDENTITY:
-                        kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::RELU:
-                        kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
-                        kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::POWER:
-                        kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::TANH:
-                        kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::SIGMOID:
-                        kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::SWISH:
-                        kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::MISH:
-                        kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
-                        break;
-                }
+                convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, allocator.get_instance());
            }
            else
            {
-                switch(activation)
+                convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
+                if (!biasTensor.empty())
                {
-                    case ConvolutionConfiguration::ActivationType::IDENTITY:
-                        break;
-                    case ConvolutionConfiguration::ActivationType::RELU:
-                        kernels::relu<T>(stream, output, output, relu_negative_slope);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
-                        kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::POWER:
-                        kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::TANH:
-                        kernels::tanh<T>(stream, output, output);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::SIGMOID:
-                        kernels::sigmoid<T>(stream, output, output);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::SWISH:
-                        kernels::swish<T>(stream, output, output);
-                        break;
-                    case ConvolutionConfiguration::ActivationType::MISH:
-                        kernels::mish<T>(stream, output, output);
-                        break;
+                    std::size_t inner_size = output.size_range(2, output.rank());
+                    switch(activation)
+                    {
+                        case ConvolutionConfiguration::ActivationType::IDENTITY:
+                            kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::RELU:
+                            kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                            kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::POWER:
+                            kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::TANH:
+                            kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::SIGMOID:
+                            kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::SWISH:
+                            kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::MISH:
+                            kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
+                            break;
+                    }
+                }
+                else
+                {
+                    switch(activation)
+                    {
+                        case ConvolutionConfiguration::ActivationType::IDENTITY:
+                            break;
+                        case ConvolutionConfiguration::ActivationType::RELU:
+                            kernels::relu<T>(stream, output, output, relu_negative_slope);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                            kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::POWER:
+                            kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::TANH:
+                            kernels::tanh<T>(stream, output, output);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::SIGMOID:
+                            kernels::sigmoid<T>(stream, output, output);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::SWISH:
+                            kernels::swish<T>(stream, output, output);
+                            break;
+                        case ConvolutionConfiguration::ActivationType::MISH:
+                            kernels::mish<T>(stream, output, output);
+                            break;
+                    }
                }
            }
        }
@ -326,6 +349,11 @@ namespace cv { namespace dnn { namespace cuda4dnn {

        ConvolutionConfiguration::ActivationType activation;
        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+
+        enum class InternalFusionLocation {
+            CUDNN,
+            NATIVE
+        } fusion_location;
    };

 }}} /* namespace cv::dnn::cuda4dnn */