use optimized cuDNN path for conv + bias + relu

pull/16254/head
YashasSamaga 5 years ago
parent 9ec3d76b21
commit 17a35587e1
  1. 80
      modules/dnn/src/cuda4dnn/csl/cudnn/activation.hpp
  2. 88
      modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
  3. 32
      modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
  4. 138
      modules/dnn/src/cuda4dnn/primitives/convolution.hpp

@ -0,0 +1,80 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
#include <cudnn.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
class ActivationDescriptor {
public:
enum class ActivationType {
IDENTITY,
RELU,
CLIPPED_RELU,
TANH,
SIGMOID,
ELU
};
ActivationDescriptor() noexcept : descriptor{ nullptr } { }
ActivationDescriptor(const ActivationDescriptor&) = delete;
ActivationDescriptor(ActivationDescriptor&& other) noexcept
: descriptor{ other.descriptor } {
other.descriptor = nullptr;
}
/* `relu_ceiling_or_elu_alpha`:
* - `alpha` coefficient in ELU activation
* - `ceiling` for CLIPPED_RELU activation
*/
ActivationDescriptor(ActivationType type, double relu_ceiling_or_elu_alpha = 0.0) {
CUDA4DNN_CHECK_CUDNN(cudnnCreateActivationDescriptor(&descriptor));
try {
const auto mode = [type] {
switch(type) {
case ActivationType::IDENTITY: return CUDNN_ACTIVATION_IDENTITY;
case ActivationType::RELU: return CUDNN_ACTIVATION_RELU;
case ActivationType::CLIPPED_RELU: return CUDNN_ACTIVATION_CLIPPED_RELU;
case ActivationType::SIGMOID: return CUDNN_ACTIVATION_SIGMOID;
case ActivationType::TANH: return CUDNN_ACTIVATION_TANH;
case ActivationType::ELU: return CUDNN_ACTIVATION_ELU;
}
CV_Assert(0);
return CUDNN_ACTIVATION_IDENTITY;
} ();
CUDA4DNN_CHECK_CUDNN(cudnnSetActivationDescriptor(descriptor, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling_or_elu_alpha));
} catch(...) {
/* cudnnDestroyActivationDescriptor will not fail for a valid descriptor object */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
throw;
}
}
~ActivationDescriptor() noexcept {
if (descriptor != nullptr) {
/* cudnnDestroyActivationDescriptor will not fail */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
}
}
ActivationDescriptor& operator=(const ActivationDescriptor&) = delete;
ActivationDescriptor& operator=(ActivationDescriptor&& other) noexcept {
descriptor = other.descriptor;
other.descriptor = nullptr;
return *this;
};
cudnnActivationDescriptor_t get() const noexcept { return descriptor; }
private:
cudnnActivationDescriptor_t descriptor;
};
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP */

@ -6,6 +6,7 @@
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#include "cudnn.hpp" #include "cudnn.hpp"
#include "activation.hpp"
#include "../pointer.hpp" #include "../pointer.hpp"
#include "../workspace.hpp" #include "../workspace.hpp"
@ -405,6 +406,93 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
); );
} }
/** @brief performs convolution, bias addition and activation simultaneously
*
* dstValue = act(alpha * conv(input) + bias)
*
* @tparam T convolution element type (must be `half` or `float`)
*
* @param handle valid cuDNN Handle
* @param convDesc convolution description
* @param convAlgo algorithm to use for convolution
* @param workspace workspace memory which meets the requirements of \p convAlgo
* @param filterDesc filter descriptor
* @param[in] filterPtr pointer to device memory containing the filters
* @param alpha convolution scale factor
* @param inputDesc tensor descriptor describing the input
* @param[in] inputPtr pointer to input tensor in device memory
* @param biasDesc tensor descriptor describing the bias
* @param[in] biasPtr pointer to bias tensor in device memory
* @param actDesc activation descriptor
* @param outputDesc tensor descriptor describing the output
* @param[out] outputPtr pointer to output tensor in device memory
*
* Exception Guarantee: Basic
*/
template <class T>
void convolve_with_bias_activation(
const Handle& handle,
T alpha,
const ConvolutionDescriptor<T>& convDesc,
const ConvolutionAlgorithm<T>& convAlgo,
WorkspaceInstance workspace,
const FilterDescriptor<T>& filterDesc,
DevicePtr<const T> filterPtr,
const TensorDescriptor<T>& inputDesc,
DevicePtr<const T> inputPtr,
const TensorDescriptor<T>& biasDesc,
DevicePtr<const T> biasPtr,
const ActivationDescriptor& actDesc,
const TensorDescriptor<T>& outputDesc,
DevicePtr<T> outputPtr)
{
CV_Assert(handle);
T alpha2 = 0.0;
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
handle.get(),
&alpha, inputDesc.get(), inputPtr.get(),
filterDesc.get(), filterPtr.get(),
convDesc.get(), convAlgo.get(),
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
&alpha2, outputDesc.get(), outputPtr.get(),
biasDesc.get(), biasPtr.get(),
actDesc.get(),
outputDesc.get(), outputPtr.get()));
}
template <> inline
void convolve_with_bias_activation(
const Handle& handle,
half alpha,
const ConvolutionDescriptor<half>& convDesc,
const ConvolutionAlgorithm<half>& convAlgo,
WorkspaceInstance workspace,
const FilterDescriptor<half>& filterDesc,
DevicePtr<const half> filterPtr,
const TensorDescriptor<half>& inputDesc,
DevicePtr<const half> inputPtr,
const TensorDescriptor<half>& biasDesc,
DevicePtr<const half> biasPtr,
const ActivationDescriptor& actDesc,
const TensorDescriptor<half>& outputDesc,
DevicePtr<half> outputPtr)
{
CV_Assert(handle);
float alpha_ = alpha, alpha2 = 0.0;
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
handle.get(),
&alpha_, inputDesc.get(), inputPtr.get(),
filterDesc.get(), filterPtr.get(),
convDesc.get(), convAlgo.get(),
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
&alpha2, outputDesc.get(), outputPtr.get(),
biasDesc.get(), biasPtr.get(),
actDesc.get(),
outputDesc.get(), outputPtr.get()));
}
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */

@ -135,17 +135,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
using FilterDescriptor = cudnn::FilterDescriptor<T>; using FilterDescriptor = cudnn::FilterDescriptor<T>;
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>; using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>; using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
using ActivationDescriptor = cudnn::ActivationDescriptor;
public: public:
using ActivationType = ActivationDescriptor::ActivationType;
struct params_type { struct params_type {
/* convolution */
std::vector<std::size_t> input_shape; std::vector<std::size_t> input_shape;
std::vector<std::size_t> filter_shape; std::vector<std::size_t> filter_shape;
std::vector<std::size_t> padding; std::vector<std::size_t> padding;
std::vector<std::size_t> stride; std::vector<std::size_t> stride;
std::vector<std::size_t> dilation; std::vector<std::size_t> dilation;
std::size_t groups; std::size_t groups;
/* bias and activation (only RELU supported) */
std::vector<std::size_t> bias_shape;
ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */
}; };
Convolution() = default; Convolution() = default;
@ -158,6 +164,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
filterDesc = FilterDescriptor(params.filter_shape); filterDesc = FilterDescriptor(params.filter_shape);
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups); convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
if (!params.bias_shape.empty()) {
CV_Assert(params.activation_type == ActivationType::RELU);
biasTensorDesc = TensorDescriptor(params.bias_shape);
activationDesc = ActivationDescriptor(params.activation_type, 0.0);
} else {
CV_Assert(params.activation_type == ActivationType::IDENTITY);
}
std::vector<int> output_dims; std::vector<int> output_dims;
getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
outputTensorDesc = TensorDescriptor(output_dims); outputTensorDesc = TensorDescriptor(output_dims);
@ -182,12 +196,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
); );
} }
void convolve_with_bias_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, WorkspaceInstance scratchpad) {
cudnn::convolve_with_bias_activation<T>(
cudnnHandle,
1.0, convDesc, algo, scratchpad,
filterDesc, filters.get(),
inputTensorDesc, input.get(),
biasTensorDesc, bias.get(),
activationDesc,
outputTensorDesc, output.get()
);
}
private: private:
cudnn::Handle cudnnHandle; cudnn::Handle cudnnHandle;
TensorDescriptor inputTensorDesc, outputTensorDesc; TensorDescriptor inputTensorDesc, outputTensorDesc;
FilterDescriptor filterDesc; FilterDescriptor filterDesc;
ConvolutionDescriptor convDesc; ConvolutionDescriptor convDesc;
ConvolutionAlgorithm algo; ConvolutionAlgorithm algo;
TensorDescriptor biasTensorDesc;
ActivationDescriptor activationDesc;
}; };
template <class T> template <class T>

@ -204,6 +204,21 @@ namespace cv { namespace dnn { namespace cuda4dnn {
params.dilation = dilations; params.dilation = dilations;
params.groups = config.groups; params.groups = config.groups;
/* check if we can perform fused convolution using cudnn */
params.activation_type = csl::Convolution<T>::ActivationType::IDENTITY;
fusion_location = InternalFusionLocation::NATIVE;
if (!biasTensor.empty() &&
biasTensor.size() == output_feature_maps && /* cuDNN requirement */
config.activation_type == ConvolutionConfiguration::ActivationType::RELU &&
config.relu_negative_slope == 0.0)
{
fusion_location = InternalFusionLocation::CUDNN;
auto bias_shape = std::vector<std::size_t>(rank, 1);
bias_shape[1] = output_feature_maps;
params.bias_shape = bias_shape;
params.activation_type = csl::Convolution<T>::ActivationType::RELU;
}
convoluter = csl::Convolution<T>(cudnnHandle, params); convoluter = csl::Convolution<T>(cudnnHandle, params);
activation = config.activation_type; activation = config.activation_type;
@ -216,7 +231,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
activation = ConvolutionConfiguration::ActivationType::IDENTITY; activation = ConvolutionConfiguration::ActivationType::IDENTITY;
csl::WorkspaceBuilder builder; csl::WorkspaceBuilder builder;
if (!transformed_shape.empty()) { if (!transformed_shape.empty())
{
auto& shape = transformed_shape; auto& shape = transformed_shape;
auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>()); auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>());
builder.require<T>(sz); builder.require<T>(sz);
@ -248,65 +264,72 @@ namespace cv { namespace dnn { namespace cuda4dnn {
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
auto output = output_wrapper->getSpan(); auto output = output_wrapper->getSpan();
convoluter.convolve(output, input, filtersTensor, allocator.get_instance()); if (fusion_location == InternalFusionLocation::CUDNN)
if (!biasTensor.empty())
{ {
std::size_t inner_size = output.size_range(2, output.rank()); convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, allocator.get_instance());
switch(activation)
{
case ConvolutionConfiguration::ActivationType::IDENTITY:
kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::RELU:
kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
break;
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::SIGMOID:
kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::SWISH:
kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::MISH:
kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
break;
}
} }
else else
{ {
switch(activation) convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
if (!biasTensor.empty())
{ {
case ConvolutionConfiguration::ActivationType::IDENTITY: std::size_t inner_size = output.size_range(2, output.rank());
break; switch(activation)
case ConvolutionConfiguration::ActivationType::RELU: {
kernels::relu<T>(stream, output, output, relu_negative_slope); case ConvolutionConfiguration::ActivationType::IDENTITY:
break; kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: break;
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil); case ConvolutionConfiguration::ActivationType::RELU:
break; kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
case ConvolutionConfiguration::ActivationType::POWER: break;
kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0); case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
break; kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
case ConvolutionConfiguration::ActivationType::TANH: break;
kernels::tanh<T>(stream, output, output); case ConvolutionConfiguration::ActivationType::POWER:
break; kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
case ConvolutionConfiguration::ActivationType::SIGMOID: break;
kernels::sigmoid<T>(stream, output, output); case ConvolutionConfiguration::ActivationType::TANH:
break; kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
case ConvolutionConfiguration::ActivationType::SWISH: break;
kernels::swish<T>(stream, output, output); case ConvolutionConfiguration::ActivationType::SIGMOID:
break; kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
case ConvolutionConfiguration::ActivationType::MISH: break;
kernels::mish<T>(stream, output, output); case ConvolutionConfiguration::ActivationType::SWISH:
break; kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::MISH:
kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
break;
}
}
else
{
switch(activation)
{
case ConvolutionConfiguration::ActivationType::IDENTITY:
break;
case ConvolutionConfiguration::ActivationType::RELU:
kernels::relu<T>(stream, output, output, relu_negative_slope);
break;
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::SIGMOID:
kernels::sigmoid<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::SWISH:
kernels::swish<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::MISH:
kernels::mish<T>(stream, output, output);
break;
}
} }
} }
} }
@ -326,6 +349,11 @@ namespace cv { namespace dnn { namespace cuda4dnn {
ConvolutionConfiguration::ActivationType activation; ConvolutionConfiguration::ActivationType activation;
float relu_negative_slope, crelu_floor, crelu_ceil, power_exp; float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
enum class InternalFusionLocation {
CUDNN,
NATIVE
} fusion_location;
}; };
}}} /* namespace cv::dnn::cuda4dnn */ }}} /* namespace cv::dnn::cuda4dnn */

Loading…
Cancel
Save