|
|
|
@ -225,6 +225,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu |
|
|
|
|
); |
|
|
|
|
} |
|
|
|
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count)); |
|
|
|
|
|
|
|
|
|
#if CUDNN_MAJOR >= 8 |
|
|
|
|
/* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
|
|
|
|
|
* in the default setting. TF32 convolutions have lower precision than FP32. |
|
|
|
|
* Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior. |
|
|
|
|
*/ |
|
|
|
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH)); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (std::is_same<T, half>::value) |
|
|
|
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH)); |
|
|
|
|
} catch (...) { |
|
|
|
@ -254,15 +263,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu |
|
|
|
|
*/ |
|
|
|
|
ConvolutionAlgorithm( |
|
|
|
|
const Handle& handle, |
|
|
|
|
const ConvolutionDescriptor<T>& conv, |
|
|
|
|
const FilterDescriptor<T>& filter, |
|
|
|
|
const TensorDescriptor<T>& input, |
|
|
|
|
const TensorDescriptor<T>& output) |
|
|
|
|
const ConvolutionDescriptor<T>& convDesc, |
|
|
|
|
const FilterDescriptor<T>& filterDesc, |
|
|
|
|
const TensorDescriptor<T>& inputDesc, |
|
|
|
|
const TensorDescriptor<T>& outputDesc) |
|
|
|
|
{ |
|
|
|
|
#if CUDNN_MAJOR >= 8 |
|
|
|
|
int requestedAlgoCount = 0, returnedAlgoCount = 0; |
|
|
|
|
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount)); |
|
|
|
|
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount); |
|
|
|
|
CUDA4DNN_CHECK_CUDNN( |
|
|
|
|
cudnnGetConvolutionForwardAlgorithm_v7( |
|
|
|
|
handle.get(), |
|
|
|
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(), |
|
|
|
|
requestedAlgoCount, |
|
|
|
|
&returnedAlgoCount, |
|
|
|
|
&results[0] |
|
|
|
|
) |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
size_t free_memory, total_memory; |
|
|
|
|
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory)); |
|
|
|
|
|
|
|
|
|
bool found_conv_algorithm = false; |
|
|
|
|
for (int i = 0; i < returnedAlgoCount; i++) |
|
|
|
|
{ |
|
|
|
|
if (results[i].status == CUDNN_STATUS_SUCCESS && |
|
|
|
|
results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && |
|
|
|
|
results[i].memory < free_memory) |
|
|
|
|
{ |
|
|
|
|
found_conv_algorithm = true; |
|
|
|
|
algo = results[i].algo; |
|
|
|
|
workspace_size = results[i].memory; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (!found_conv_algorithm) |
|
|
|
|
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution."); |
|
|
|
|
#else |
|
|
|
|
CUDA4DNN_CHECK_CUDNN( |
|
|
|
|
cudnnGetConvolutionForwardAlgorithm( |
|
|
|
|
handle.get(), |
|
|
|
|
input.get(), filter.get(), conv.get(), output.get(), |
|
|
|
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(), |
|
|
|
|
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, |
|
|
|
|
0, /* no memory limit */ |
|
|
|
|
&algo |
|
|
|
@ -272,10 +315,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu |
|
|
|
|
CUDA4DNN_CHECK_CUDNN( |
|
|
|
|
cudnnGetConvolutionForwardWorkspaceSize( |
|
|
|
|
handle.get(), |
|
|
|
|
input.get(), filter.get(), conv.get(), output.get(), |
|
|
|
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(), |
|
|
|
|
algo, &workspace_size |
|
|
|
|
) |
|
|
|
|
); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; |
|
|
|
|