Open Source Computer Vision Library https://opencv.org/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

410 lines
15 KiB

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#include "cudnn.hpp"
#include "../pointer.hpp"
#include "../workspace.hpp"
#include <cudnn.h>
#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
/** describe convolution filters
*
* @tparam T type of elements in the kernels
*/
template <class T>
class FilterDescriptor {
public:
FilterDescriptor() noexcept : descriptor{ nullptr } { }
FilterDescriptor(const FilterDescriptor&) = delete;
FilterDescriptor(FilterDescriptor&& other) noexcept
: descriptor{ other.descriptor } {
other.descriptor = nullptr;
}
/** constructs a filter descriptor from the filter dimensions provided in \p shape
*
* Shape dimensions:
* 0: number of filters
* 1: number of input feature maps
* 2..n: kernel dimensions
*
* Exception Guarantee: Strong
*/
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
FilterDescriptor(const SequenceContainer& shape) {
constructor(shape.begin(), shape.end());
}
/** constructs a filter descriptor from the filter dimensions provided in [begin, end)
*
* Shape dimensions:
* 0: number of filters
* 1: number of input feature maps
* 2..n: kernel dimensions
*
* Exception Guarantee: Strong
*/
template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
FilterDescriptor(ForwardItr begin, ForwardItr end) {
constructor(begin, end);
}
/** constructs a filter descriptor from the filter dimensions provided as arguments
*
* Shape dimensions:
* 0: number of filters
* 1: number of input feature maps
* 2..n: kernel dimensions
*
* Exception Guarantee: Strong
*/
template <class ...Sizes>
FilterDescriptor(Sizes ...sizes) {
static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
constructor(std::begin(dims), std::end(dims));
}
~FilterDescriptor() noexcept {
if (descriptor != nullptr) {
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
}
}
FilterDescriptor& operator=(const FilterDescriptor&) = delete;
FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
descriptor = other.descriptor;
other.descriptor = nullptr;
return *this;
};
cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
private:
template <class ForwardItr>
void constructor(ForwardItr start, ForwardItr end) {
CV_Assert(start != end);
CV_Assert(std::distance(start, end) >= 3);
CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
try {
const auto rank = std::distance(start, end);
if (rank == 4) {
std::array<int, 4> dims;
std::copy(start, end, std::begin(dims));
CUDA4DNN_CHECK_CUDNN(
cudnnSetFilter4dDescriptor(
descriptor,
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
dims[0], dims[1], dims[2], dims[3]
)
);
} else {
std::vector<int> dims(start, end);
CUDA4DNN_CHECK_CUDNN(
cudnnSetFilterNdDescriptor(
descriptor,
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
dims.size(), dims.data()
)
);
}
} catch (...) {
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
throw;
}
}
cudnnFilterDescriptor_t descriptor;
};
/** describes a convolution operation
*
* @tparam T type of element participating in convolution
*/
template <class T>
class ConvolutionDescriptor {
public:
ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
: descriptor{ other.descriptor } {
other.descriptor = nullptr;
}
/** constructs a convolution descriptor
*
* Pre-conditions:
* - \p zero_padding, \p stride and \p dilation must have the same size
*
* The length of the containers is interpreted as the order of the convolution.
*
* Exception Guarantee: Strong
*/
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
ConvolutionDescriptor(
const SequenceContainer& zero_padding,
const SequenceContainer& stride,
const SequenceContainer& dilation,
std::size_t group_count)
{
constructor(zero_padding, stride, dilation, group_count);
}
~ConvolutionDescriptor() noexcept {
if (descriptor != nullptr) {
/* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
}
}
ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
descriptor = other.descriptor;
other.descriptor = nullptr;
return *this;
};
cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
private:
template <class SequenceContainer>
void constructor(
const SequenceContainer& zero_padding,
const SequenceContainer& stride,
const SequenceContainer& dilation,
std::size_t group_count)
{
CV_Assert(zero_padding.size() == stride.size());
CV_Assert(zero_padding.size() == dilation.size());
CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
try {
const auto rank = zero_padding.size();
if (rank == 2) {
CUDA4DNN_CHECK_CUDNN(
cudnnSetConvolution2dDescriptor(
descriptor,
zero_padding[0], zero_padding[1],
stride[0], stride[1],
dilation[0], dilation[1],
CUDNN_CROSS_CORRELATION,
detail::get_data_type<T>()
)
);
} else {
std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
std::vector<int> istride(std::begin(stride), std::end(stride));
std::vector<int> idilation(std::begin(dilation), std::end(dilation));
CUDA4DNN_CHECK_CUDNN(
cudnnSetConvolutionNdDescriptor(
descriptor,
rank, ipadding.data(), istride.data(), idilation.data(),
CUDNN_CROSS_CORRELATION,
detail::get_data_type<T>()
)
);
}
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
if (std::is_same<T, half>::value)
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
} catch (...) {
/* cudnnDestroyConvolutionDescriptor will not fail for a valid desriptor object */
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
throw;
}
}
cudnnConvolutionDescriptor_t descriptor;
};
/** wrapper around a convolution algorithm
*
* @tparam T type of elements being convolved
*/
template <class T>
class ConvolutionAlgorithm {
public:
ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
/** selects a good algorithm for convolution for given configuration
*
* Exception Guarantee: Strong
*/
ConvolutionAlgorithm(
const Handle& handle,
const ConvolutionDescriptor<T>& conv,
const FilterDescriptor<T>& filter,
const TensorDescriptor<T>& input,
const TensorDescriptor<T>& output)
{
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardAlgorithm(
handle.get(),
input.get(), filter.get(), conv.get(), output.get(),
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, /* no memory limit */
&algo
)
);
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionForwardWorkspaceSize(
handle.get(),
input.get(), filter.get(), conv.get(), output.get(),
algo, &workspace_size
)
);
}
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
/** number of bytes of workspace memory required by the algorithm */
std::size_t get_workspace_size() const noexcept { return workspace_size; }
private:
cudnnConvolutionFwdAlgo_t algo;
std::size_t workspace_size;
};
/** gives the shape of the output tensor of convolution
*
* Exception Guarantee: Basic
*/
template <class T>
void getConvolutionForwardOutputDim(
const ConvolutionDescriptor<T>& convDesc,
const FilterDescriptor<T>& filterDesc,
const TensorDescriptor<T>& inputDesc,
std::vector<int>& output)
{
output.clear();
output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */
std::vector<int> temp(CUDNN_DIM_MAX);
cudnnDataType_t tempDataType;
CUDA4DNN_CHECK_CUDNN(
cudnnGetTensorNdDescriptor(
inputDesc.get(),
CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
&tempDataType,
output.data(),
temp.data(),
temp.data()
)
);
const auto rank = output[0];
output.resize(rank);
CUDA4DNN_CHECK_CUDNN(
cudnnGetConvolutionNdForwardOutputDim(
convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
)
);
}
/** @brief performs convolution
*
* dstValue = alpha * result + beta * priorDstValue
*
* @tparam T convolution element type (must be `half` or `float`)
*
* @param handle valid cuDNN Handle
* @param convDesc convolution description
* @param convAlgo algorithm to use for convolution
* @param workspace workspace memory which meets the requirements of \p convAlgo
* @param filterDesc filter descriptor
* @param[in] filterPtr pointer to device memory containing the filters
* @param inputDesc tensor descriptor describing the input
* @param[in] inputPtr pointer to input tensor in device memory
* @param alpha result scale factor
* @param beta previous value scale factor
* @param outputDesc tensor descriptor describing the output
* @param[out] outputPtr pointer to output tensor in device memory
*
* Exception Guarantee: Basic
*/
template <class T>
void convolve(
const Handle& handle,
const ConvolutionDescriptor<T>& convDesc,
const ConvolutionAlgorithm<T>& convAlgo,
WorkspaceInstance workspace,
const FilterDescriptor<T>& filterDesc,
DevicePtr<const T> filterPtr,
const TensorDescriptor<T>& inputDesc,
DevicePtr<const T> inputPtr,
T alpha, T beta,
const TensorDescriptor<T>& outputDesc,
DevicePtr<T> outputPtr)
{
CV_Assert(handle);
CUDA4DNN_CHECK_CUDNN(
cudnnConvolutionForward(
handle.get(),
&alpha, inputDesc.get(), inputPtr.get(),
filterDesc.get(), filterPtr.get(),
convDesc.get(), convAlgo.get(),
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
&beta, outputDesc.get(), outputPtr.get()
)
);
}
template <> inline
void convolve(
const Handle& handle,
const ConvolutionDescriptor<half>& convDesc,
const ConvolutionAlgorithm<half>& convAlgo,
WorkspaceInstance workspace,
const FilterDescriptor<half>& filterDesc,
DevicePtr<const half> filterPtr,
const TensorDescriptor<half>& inputDesc,
DevicePtr<const half> inputPtr,
half alpha, half beta,
const TensorDescriptor<half>& outputDesc,
DevicePtr<half> outputPtr)
{
CV_Assert(handle);
/* we specalize for fp16 as the scaling factors must be provided as `float` */
float alpha_ = alpha, beta_ = beta;
CUDA4DNN_CHECK_CUDNN(
cudnnConvolutionForward(
handle.get(),
&alpha_, inputDesc.get(), inputPtr.get(),
filterDesc.get(), filterPtr.get(),
convDesc.get(), convAlgo.get(),
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
&beta_, outputDesc.get(), outputPtr.get()
)
);
}
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */