// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** describes convolution filters
     *
     * @tparam T type of elements in the kernels
     */
    template <class T>
    class FilterDescriptor {
    public:
        FilterDescriptor() noexcept : descriptor{ nullptr } { }
        FilterDescriptor(const FilterDescriptor&) = delete;
        FilterDescriptor(FilterDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a filter descriptor from the filter dimensions provided in \p shape
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        FilterDescriptor(const SequenceContainer& shape) {
            constructor(shape.begin(), shape.end());
        }

        /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
        FilterDescriptor(ForwardItr begin, ForwardItr end) {
            constructor(begin, end);
        }

        /** constructs a filter descriptor from the filter dimensions provided as arguments
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ...Sizes>
        FilterDescriptor(Sizes ...sizes) {
            static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
            constructor(std::begin(dims), std::end(dims));
        }

        ~FilterDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
            }
        }

        FilterDescriptor& operator=(const FilterDescriptor&) = delete;
        FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnFilterDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class ForwardItr>
        void constructor(ForwardItr start, ForwardItr end) {
            CV_Assert(start != end);
            CV_Assert(std::distance(start, end) >= 3);
            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);

            CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
            try {
                const auto rank = std::distance(start, end);
                if (rank == 4) {
                    std::array<int, 4> dims;
                    std::copy(start, end, std::begin(dims));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilter4dDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims[0], dims[1], dims[2], dims[3]
                        )
                    );
                } else {
                    std::vector<int> dims(start, end);
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilterNdDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims.size(), dims.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
                throw;
            }
        }

        cudnnFilterDescriptor_t descriptor;
    };
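    /* Usage sketch (illustrative, not part of the original header): the shape passed to
     * FilterDescriptor follows the [filters, input feature maps, kernel dims...] convention
     * documented above. The variable names below are hypothetical.
     *
     *     // 32 filters over 16 input feature maps with 3x3 kernels (rank-4, NCHW layout)
     *     FilterDescriptor<float> filters(32, 16, 3, 3);
     *
     *     // equivalently, from a container
     *     std::vector<int> filter_shape { 32, 16, 3, 3 };
     *     FilterDescriptor<float> filters_from_shape(filter_shape);
     */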
    /** describes a convolution operation
     *
     * @tparam T type of element participating in convolution
     */
    template <class T>
    class ConvolutionDescriptor {
    public:
        ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
        ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a convolution descriptor
         *
         * Pre-conditions:
         * - \p zero_padding, \p stride and \p dilation must have the same size
         *
         * The length of the containers is interpreted as the order of the convolution.
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        ConvolutionDescriptor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            constructor(zero_padding, stride, dilation, group_count);
        }

        ~ConvolutionDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
            }
        }

        ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            CV_Assert(zero_padding.size() == stride.size());
            CV_Assert(zero_padding.size() == dilation.size());

            CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
            try {
                const auto rank = zero_padding.size();
                if (rank == 2) {
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolution2dDescriptor(
                            descriptor,
                            zero_padding[0], zero_padding[1],
                            stride[0], stride[1],
                            dilation[0], dilation[1],
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                } else {
                    std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
                    std::vector<int> istride(std::begin(stride), std::end(stride));
                    std::vector<int> idilation(std::begin(dilation), std::end(dilation));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolutionNdDescriptor(
                            descriptor,
                            rank, ipadding.data(), istride.data(), idilation.data(),
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                }
                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));

                if (std::is_same<T, half>::value)
                    CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
            } catch (...) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
                throw;
            }
        }

        cudnnConvolutionDescriptor_t descriptor;
    };
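    /* Usage sketch (illustrative, not part of the original header): a 2D convolution with
     * one pixel of zero padding, unit stride, no dilation and a single group. All three
     * containers must have the same size; that size fixes the order of the convolution.
     * The variable names are hypothetical.
     *
     *     std::vector<std::size_t> padding  { 1, 1 };
     *     std::vector<std::size_t> stride   { 1, 1 };
     *     std::vector<std::size_t> dilation { 1, 1 };
     *     ConvolutionDescriptor<float> convDesc(padding, stride, dilation, 1);
     */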
    /** wrapper around a convolution algorithm
     *
     * @tparam T type of elements being convolved
     */
    template <class T>
    class ConvolutionAlgorithm {
    public:
        ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;

        /** selects a good algorithm for convolution for the given configuration
         *
         * Exception Guarantee: Strong
         */
        ConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& conv,
            const FilterDescriptor<T>& filter,
            const TensorDescriptor<T>& input,
            const TensorDescriptor<T>& output)
        {
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardAlgorithm(
                    handle.get(),
                    input.get(), filter.get(), conv.get(), output.get(),
                    CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                    0, /* no memory limit */
                    &algo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardWorkspaceSize(
                    handle.get(),
                    input.get(), filter.get(), conv.get(), output.get(),
                    algo, &workspace_size
                )
            );
        }

        ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;

        cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }

        /** number of bytes of workspace memory required by the algorithm */
        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionFwdAlgo_t algo;
        std::size_t workspace_size;
    };

    /** gives the shape of the output tensor of convolution
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void getConvolutionForwardOutputDim(
        const ConvolutionDescriptor<T>& convDesc,
        const FilterDescriptor<T>& filterDesc,
        const TensorDescriptor<T>& inputDesc,
        std::vector<int>& output)
    {
        output.clear();
        output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */

        std::vector<int> temp(CUDNN_DIM_MAX);
        cudnnDataType_t tempDataType;
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetTensorNdDescriptor(
                inputDesc.get(),
                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
                &tempDataType,
                output.data(),
                temp.data(),
                temp.data()
            )
        );

        const auto rank = output[0];
        output.resize(rank);
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetConvolutionNdForwardOutputDim(
                convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
            )
        );
    }

    /** @brief performs convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam     T           convolution element type (must be `half` or `float`)
     *
     * @param      handle      valid cuDNN Handle
     * @param      convDesc    convolution description
     * @param      convAlgo    algorithm to use for convolution
     * @param      workspace   workspace memory which meets the requirements of \p convAlgo
     * @param      filterDesc  filter descriptor
     * @param[in]  filterPtr   pointer to device memory containing the filters
     * @param      inputDesc   tensor descriptor describing the input
     * @param[in]  inputPtr    pointer to input tensor in device memory
     * @param      alpha       result scale factor
     * @param      beta        previous value scale factor
     * @param      outputDesc  tensor descriptor describing the output
     * @param[out] outputPtr   pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha_, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }
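    /* End-to-end sketch (illustrative, not part of the original header): how the pieces above
     * are typically combined for a forward pass. `Handle`, `TensorDescriptor` and the device
     * pointers are assumed to come from the surrounding CSL headers; the variable names,
     * shapes and the way the workspace is obtained are hypothetical.
     *
     *     Handle handle;
     *     TensorDescriptor<float> inputDesc(std::vector<int>{ 1, 16, 32, 32 });
     *     FilterDescriptor<float> filterDesc(32, 16, 3, 3);
     *     ConvolutionDescriptor<float> convDesc(
     *         std::vector<std::size_t>{ 1, 1 },   // zero padding
     *         std::vector<std::size_t>{ 1, 1 },   // stride
     *         std::vector<std::size_t>{ 1, 1 },   // dilation
     *         1);                                 // group count
     *
     *     // derive the output shape and build its descriptor
     *     std::vector<int> output_dims;
     *     getConvolutionForwardOutputDim(convDesc, filterDesc, inputDesc, output_dims);
     *     TensorDescriptor<float> outputDesc(output_dims);
     *
     *     // pick an algorithm and provide workspace memory of at least
     *     // convAlgo.get_workspace_size() bytes as a WorkspaceInstance
     *     ConvolutionAlgorithm<float> convAlgo(handle, convDesc, filterDesc, inputDesc, outputDesc);
     *
     *     // dstValue = 1.0 * conv(input, filters) + 0.0 * priorDstValue
     *     convolve<float>(handle, convDesc, convAlgo, workspace,
     *                     filterDesc, filterPtr, inputDesc, inputPtr,
     *                     1.0f, 0.0f, outputDesc, outputPtr);
     */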

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */