From 17c485eb03a8db9a19c4b5b82fcf1a906aa9d6ce Mon Sep 17 00:00:00 2001 From: Yashas Samaga B L Date: Sun, 15 Dec 2019 00:56:58 +0530 Subject: [PATCH] Merge pull request #16092 from YashasSamaga:cuda4dnn-conv-act-fuse cuda4dnn: fuse activations with convolutions * fuse ReLU, ReLU6, TanH, Sigmoid with conv * fix OpenCL errors * improve ReLU, add power, swish and mish * fix missing fusion entries * fix handling of unsetAttached * remove whole file indentation * optimize power = 1.0, use IDENTITY instead of NONE * handle edge case: change backend and then clear --- modules/dnn/src/cuda/bias_activation.cu | 336 ++++++++++++++++++ .../src/cuda4dnn/kernels/bias_activation.hpp | 38 ++ .../src/cuda4dnn/primitives/convolution.hpp | 89 ++++- modules/dnn/src/dnn.cpp | 15 +- modules/dnn/src/layers/convolution_layer.cpp | 71 ++++ 5 files changed, 545 insertions(+), 4 deletions(-) create mode 100644 modules/dnn/src/cuda/bias_activation.cu create mode 100644 modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp diff --git a/modules/dnn/src/cuda/bias_activation.cu b/modules/dnn/src/cuda/bias_activation.cu new file mode 100644 index 0000000000..42161362ee --- /dev/null +++ b/modules/dnn/src/cuda/bias_activation.cu @@ -0,0 +1,336 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include +#include + +#include "types.hpp" +#include "math.hpp" +#include "vector_traits.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + +namespace raw { + + template + __global__ void biasN_relu_inplace_vec(Span inplace_output, size_type inner_size, View bias, T slope) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + vec.data[j] += bias[bias_idx]; + vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : slope * vec.data[j]; + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_clipped_relu_inplace_vec(Span inplace_output, size_type inner_size, View bias, T floor, T ceil) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::clamp; + vec.data[j] = clamp(vec.data[j] + bias[bias_idx], floor, ceil); + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_power_inplace_vec(Span inplace_output, size_type inner_size, View bias, T power) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::pow; + vec.data[j] = pow(vec.data[j] + bias[bias_idx], power); + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_tanh_inplace_vec(Span inplace_output, size_type inner_size, View bias) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::tanh; + vec.data[j] = tanh(vec.data[j] + bias[bias_idx]); + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_sigmoid_inplace_vec(Span inplace_output, size_type inner_size, View bias) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::sigmoid; + vec.data[j] = sigmoid(vec.data[j] + bias[bias_idx]); + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_swish_inplace_vec(Span inplace_output, size_type inner_size, View bias) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::sigmoid; + vec.data[j] += bias[bias_idx]; + vec.data[j] = vec.data[j] * sigmoid(vec.data[j]); + } + v_store(inplace_output_vPtr[i], vec); + } + } + + template + __global__ void biasN_mish_inplace_vec(Span inplace_output, size_type inner_size, View bias) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + + inner_size /= vector_type::size(); + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + + vector_type vec; + v_load(vec, inplace_output_vPtr[i]); + for(int j = 0; j < vec.size(); j++) { + using device::tanh; + using device::log1pexp; + vec.data[j] += bias[bias_idx]; + vec.data[j] = vec.data[j] * tanh(log1pexp(vec.data[j])); + } + v_store(inplace_output_vPtr[i], vec); + } + } +} + +template static +void launch_biasN_relu_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T slope) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_relu_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias, slope); +} + +template +void biasN_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T slope) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, slope); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, slope); + } else { + launch_biasN_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, slope); + } +} + +template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half); +template void biasN_relu_inplace(const Stream&, Span, std::size_t, View, float); + +template static +void launch_biasN_clipped_relu_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T floor, T ceil) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_clipped_relu_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias, floor, ceil); +} + +template +void biasN_clipped_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T floor, T ceil) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_clipped_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, floor, ceil); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_clipped_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, floor, ceil); + } else { + launch_biasN_clipped_relu_inplace_vec_kernel(stream, inplace_output, inner_size, bias, floor, ceil); + } +} + +template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half); +template void biasN_clipped_relu_inplace(const Stream&, Span, std::size_t, View, float, float); + +template static +void launch_biasN_power_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T power) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_power_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias, power); +} + +template +void biasN_power_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T power) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_power_inplace_vec_kernel(stream, inplace_output, inner_size, bias, power); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_power_inplace_vec_kernel(stream, inplace_output, inner_size, bias, power); + } else { + launch_biasN_power_inplace_vec_kernel(stream, inplace_output, inner_size, bias, power); + } +} + +template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half); +template void biasN_power_inplace(const Stream&, Span, std::size_t, View, float); + +template static +void launch_biasN_tanh_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_tanh_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias); +} + +template +void biasN_tanh_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_tanh_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_tanh_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else { + launch_biasN_tanh_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } +} + +template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_tanh_inplace(const Stream&, Span, std::size_t, View); + +template static +void launch_biasN_sigmoid_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_sigmoid_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias); +} + +template +void biasN_sigmoid_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_sigmoid_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_sigmoid_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else { + launch_biasN_sigmoid_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } +} + +template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_sigmoid_inplace(const Stream&, Span, std::size_t, View); + +template static +void launch_biasN_swish_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_swish_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias); +} + +template +void biasN_swish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_swish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_swish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else { + launch_biasN_swish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } +} + +template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_swish_inplace(const Stream&, Span, std::size_t, View); + +template static +void launch_biasN_mish_inplace_vec_kernel(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_mish_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, inner_size, bias); +} + +template +void biasN_mish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { + launch_biasN_mish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { + launch_biasN_mish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } else { + launch_biasN_mish_inplace_vec_kernel(stream, inplace_output, inner_size, bias); + } +} + +template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_mish_inplace(const Stream&, Span, std::size_t, View); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp b/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp new file mode 100644 index 0000000000..93660a8c33 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp @@ -0,0 +1,38 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + template + void biasN_relu_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T slope); + + template + void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T floor, T ceiling); + + template + void biasN_power_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T exp); + + template + void biasN_tanh_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + + template + void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + + template + void biasN_swish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + + template + void biasN_mish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp index 6713357d92..72a84deed7 100644 --- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp @@ -12,6 +12,8 @@ #include "../csl/tensor.hpp" #include "../csl/tensor_ops.hpp" #include "../kernels/scale_shift.hpp" +#include "../kernels/activations.hpp" +#include "../kernels/bias_activation.hpp" #include @@ -44,6 +46,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { /* group count for grouped convolution */ std::size_t groups; + + enum class ActivationType { + IDENTITY, + RELU, /* uses value provided in `relu_negative_slope` */ + CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */ + POWER, /* scale and shift fused beforehand (fuseWeights); only `power_exp` is handled by CUDA */ + TANH, + SIGMOID, + SWISH, + MISH + }; + + ActivationType activation_type; + float relu_negative_slope, crelu_floor, crelu_ceil, power_exp; }; template @@ -59,7 +75,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { const auto& strides = config.strides; const auto convolution_order = kernel_size.size(); - CV_Assert(convolution_order >= 1); + CV_Assert(convolution_order > 1); CV_Assert(convolution_order == dilations.size()); CV_Assert(convolution_order == strides.size()); @@ -72,7 +88,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { const auto groups = config.groups; if (convolution_order > 3) - CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported."); + CV_Error(Error::StsNotImplemented, "Only 2D/3D convolution is supported."); const auto rank = input_shape.size(); const auto output_feature_maps = output_shape[1]; @@ -190,6 +206,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { convoluter = csl::Convolution(cudnnHandle, params); + activation = config.activation_type; + relu_negative_slope = config.relu_negative_slope; + crelu_floor = config.crelu_floor; + crelu_ceil = config.crelu_ceil; + power_exp = config.power_exp; + + if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f) + activation = ConvolutionConfiguration::ActivationType::IDENTITY; + csl::WorkspaceBuilder builder; if (!transformed_shape.empty()) { auto& shape = transformed_shape; @@ -227,7 +252,62 @@ namespace cv { namespace dnn { namespace cuda4dnn { if (!biasTensor.empty()) { std::size_t inner_size = output.size_range(2, output.rank()); - kernels::biasN(stream, output, output, inner_size, biasTensor); + switch(activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + kernels::biasN(stream, output, output, inner_size, biasTensor); + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::biasN_relu_inplace(stream, output, inner_size, biasTensor, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::biasN_clipped_relu_inplace(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::biasN_tanh_inplace(stream, output, inner_size, biasTensor); + break; + case ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::biasN_sigmoid_inplace(stream, output, inner_size, biasTensor); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::biasN_swish_inplace(stream, output, inner_size, biasTensor); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::biasN_mish_inplace(stream, output, inner_size, biasTensor); + break; + } + } + else + { + switch(activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::relu(stream, output, output, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::clipped_relu(stream, output, output, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::power(stream, output, output, power_exp, 1.0, 0.0); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::tanh(stream, output, output); + break; + case ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::sigmoid(stream, output, output); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::swish(stream, output, output); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::mish(stream, output, output); + break; + } } } @@ -243,6 +323,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { csl::TensorTransform inputTransformer; std::size_t scratch_mem_in_bytes; + + ConvolutionConfiguration::ActivationType activation; + float relu_negative_slope, crelu_floor, crelu_ceil, power_exp; }; }}} /* namespace cv::dnn::cuda4dnn */ diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 2de02732d4..2f29c28e8d 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -2405,7 +2405,7 @@ struct Net::Impl break; } - if (preferableBackend != DNN_BACKEND_OPENCV) + if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA) continue; // Go to the next layer. // TODO: OpenCL target support more fusion styles. @@ -2415,6 +2415,9 @@ struct Net::Impl ld.layerInstance->type != "Concat")) ) continue; + if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution") + continue; + while (nextData) { // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh @@ -2426,6 +2429,16 @@ struct Net::Impl nextData->type != "Power") break; + if (IS_DNN_CUDA_TARGET(preferableTarget) && + nextData->type != "ReLU" && + nextData->type != "ReLU6" && + nextData->type != "Power" && + nextData->type != "TanH" && + nextData->type != "Sigmoid" && + nextData->type != "Swish" && + nextData->type != "Mish") + break; + Ptr nextActivLayer = nextData->layerInstance.dynamicCast(); if (nextActivLayer.empty()) break; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 632165ae4a..29361ced04 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -239,6 +239,12 @@ public: ocl4dnnFusedActiv_t activType; float power; #endif + +#ifdef HAVE_CUDA + cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType; + float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp; +#endif + ConvolutionLayerImpl(const LayerParams ¶ms) : BaseConvolutionLayerImpl(params) { #ifdef HAVE_OPENCL @@ -246,6 +252,10 @@ public: activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; power = 0.f; #endif + +#ifdef HAVE_CUDA + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY; +#endif } MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE @@ -406,6 +416,61 @@ public: } } #endif + +#ifdef HAVE_CUDA + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY; + + if(IS_DNN_CUDA_TARGET(preferableTarget)) + { + Ptr activ_relu = activ.dynamicCast(); + if(!activ_relu.empty()) + { + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::RELU; + cuda_relu_slope = activ_relu->negativeSlope; + } + + Ptr activ_relu6 = activ.dynamicCast(); + if(!activ_relu6.empty()) + { + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::CLIPPED_RELU; + cuda_crelu_floor = activ_relu6->minValue; + cuda_crelu_ceil = activ_relu6->maxValue; + } + + Ptr activ_power = activ.dynamicCast(); + if (!activ_power.empty()) + { + if (activ_power->scale != 1.f || activ_power->shift != 0.f) + { + const int outCh = blobs[0].size[0]; + fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)), + Mat(1, outCh, CV_32F, Scalar(activ_power->shift))); + } + + cuda_power_exp = activ_power->power; + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER; + } + + Ptr activ_tanh = activ.dynamicCast(); + if(!activ_tanh.empty()) + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::TANH; + + Ptr activ_sigmoid = activ.dynamicCast(); + if(!activ_sigmoid.empty()) + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SIGMOID; + + Ptr activ_swish = activ.dynamicCast(); + if(!activ_swish.empty()) + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SWISH; + + Ptr activ_mish = activ.dynamicCast(); + if(!activ_mish.empty()) + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::MISH; + + if (cudaActType == cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY) + activ.reset(); + } +#endif return !activ.empty(); } @@ -1418,6 +1483,12 @@ public: config.output_shape.assign(std::begin(output_shape), std::end(output_shape)); config.groups = groups; + config.activation_type = cudaActType; + config.relu_negative_slope = cuda_relu_slope; + config.crelu_floor = cuda_crelu_floor; + config.crelu_ceil = cuda_crelu_ceil; + config.power_exp = cuda_power_exp; + Mat filtersMat = fusedWeights ? weightsMat : blobs[0]; Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat(); if (countNonZero(biasMat) == 0)