Merge pull request #17788 from YashasSamaga:cuda4dnn-nice-build

Alexander Alekhin, 5 years ago
commit cd0f0384ef

Changed files:
  1. modules/dnn/CMakeLists.txt (14 changed lines)
  2. modules/dnn/src/cuda4dnn/init.hpp (96 changed lines)
  3. modules/dnn/src/dnn.cpp (100 changed lines)

modules/dnn/CMakeLists.txt
@@ -27,8 +27,18 @@ ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
AND HAVE_CUDNN
)
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
add_definitions(-DCV_CUDA4DNN=1)
if(OPENCV_DNN_CUDA)
if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
add_definitions(-DCV_CUDA4DNN=1)
else()
if(NOT HAVE_CUDA)
message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUBLAS)
message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUDNN)
message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
endif()
endif()
endif()
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
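With this change, enabling the CUDA backend fails fast at configure time: if the CUDA Toolkit, cuBLAS, or cuDNN is missing while OPENCV_DNN_CUDA is ON, CMake stops with a SEND_ERROR naming the missing dependency instead of silently building without CV_CUDA4DNN. For reference (not part of the patch), a configuration that exercises this path looks roughly like cmake -DWITH_CUDA=ON -DWITH_CUDNN=ON -DOPENCV_DNN_CUDA=ON <opencv-source-dir>, with all other options left at their defaults.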

modules/dnn/src/cuda4dnn/init.hpp (new file)
@@ -0,0 +1,96 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
#include "csl/error.hpp"
#include <cuda_runtime.h>
#include <cudnn.h>
#include <opencv2/core/cuda.hpp>
#include <sstream>
namespace cv { namespace dnn { namespace cuda4dnn {
void checkVersions()
{
int cudart_version = 0;
CUDA4DNN_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version));
if (cudart_version != CUDART_VERSION)
{
std::ostringstream oss;
oss << "CUDART reports version " << cudart_version << " which does not match with the version " << CUDART_VERSION << " with which OpenCV was built";
CV_LOG_WARNING(NULL, oss.str().c_str());
}
auto cudnn_version = cudnnGetVersion();
if (cudnn_version != CUDNN_VERSION)
{
std::ostringstream oss;
oss << "cuDNN reports version " << cudnn_version << " which does not match with the version " << CUDNN_VERSION << " with which OpenCV was built";
CV_LOG_WARNING(NULL, oss.str().c_str());
}
auto cudnn_cudart_version = cudnnGetCudartVersion();
if (cudart_version != cudnn_cudart_version)
{
std::ostringstream oss;
oss << "CUDART version " << cudnn_cudart_version << " reported by cuDNN " << cudnn_version << " does not match with the version reported by CUDART " << cudart_version;
CV_LOG_WARNING(NULL, oss.str().c_str());
}
}
int getDeviceCount()
{
return cuda::getCudaEnabledDeviceCount();
}
int getDevice()
{
int device_id = -1;
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
return device_id;
}
bool isDeviceCompatible()
{
int device_id = getDevice();
if (device_id < 0)
return false;
int major = 0, minor = 0;
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
return true;
for (int i = minor; i >= 0; i--)
if (cv::cuda::TargetArchs::hasBin(major, i))
return true;
return false;
}
bool doesDeviceSupportFP16()
{
int device_id = getDevice();
if (device_id < 0)
return false;
int major = 0, minor = 0;
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
int version = major * 10 + minor;
if (version < 53)
return false;
return true;
}
}}} /* namespace cv::dnn::cuda4dnn */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
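An illustrative reading of the two run-time checks above (not part of the patch): for a GPU with compute capability 7.5, isDeviceCompatible() returns true if OpenCV was built with PTX for an architecture no newer than 7.5 (hasEqualOrLessPtx) or with a cubin for 7.0 through 7.5 (the hasBin loop over lower minor versions); doesDeviceSupportFP16() additionally requires compute capability 5.3 or newer (version >= 53).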

modules/dnn/src/dnn.cpp
@@ -47,7 +47,8 @@
#include "op_cuda.hpp"
#ifdef HAVE_CUDA
#include "cuda4dnn/primitives/eltwise.hpp"
#include "cuda4dnn/init.hpp"
#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
#endif
#include "halide_scheduler.hpp"
@@ -66,8 +67,6 @@
#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/core/cuda.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
@@ -159,23 +158,6 @@ public:
}
#endif
#ifdef HAVE_CUDA
static inline bool cudaDeviceSupportsFp16() {
if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
return false;
const int devId = cv::cuda::getDevice();
if (devId<0)
return false;
cv::cuda::DeviceInfo dev_info(devId);
if (!dev_info.isCompatible())
return false;
int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
if (version < 53)
return false;
return true;
}
#endif
private:
BackendRegistry()
{
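The ad-hoc cudaDeviceSupportsFp16() helper removed here is superseded by cuda4dnn::isDeviceCompatible() and cuda4dnn::doesDeviceSupportFP16() from the new init.hpp; the registry hunk below switches to those calls.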
@@ -247,9 +229,10 @@ private:
#endif
#ifdef HAVE_CUDA
if (haveCUDA()) {
if (haveCUDA() && cuda4dnn::isDeviceCompatible())
{
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
if (cudaDeviceSupportsFp16())
if (cuda4dnn::doesDeviceSupportFP16())
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
}
#endif
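Since DNN_TARGET_CUDA and DNN_TARGET_CUDA_FP16 are now registered only when the current device passes these checks, an application can query what is actually usable before committing to the CUDA backend. A minimal sketch using the public API (illustrative only; the model file name is a placeholder):

#include <opencv2/dnn.hpp>
#include <algorithm>
#include <iostream>

int main()
{
    using namespace cv::dnn;

    // Targets listed here already reflect isDeviceCompatible()/doesDeviceSupportFP16().
    std::vector<Target> targets = getAvailableTargets(DNN_BACKEND_CUDA);

    Net net = readNet("model.onnx"); // placeholder model file
    if (!targets.empty())
    {
        net.setPreferableBackend(DNN_BACKEND_CUDA);
        // Prefer FP16 when the device supports it (compute capability >= 5.3).
        bool fp16 = std::find(targets.begin(), targets.end(), DNN_TARGET_CUDA_FP16) != targets.end();
        net.setPreferableTarget(fp16 ? DNN_TARGET_CUDA_FP16 : DNN_TARGET_CUDA);
    }
    std::cout << "CUDA targets available: " << targets.size() << std::endl;
    return 0;
}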
@@ -1189,19 +1172,6 @@ struct Net::Impl : public detail::NetImplBase
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
skipInfEngineInit = false;
#ifdef HAVE_CUDA
if (cv::cuda::getCudaEnabledDeviceCount() > 0)
{
cuda4dnn::csl::CSLContext context;
context.stream = cuda4dnn::csl::Stream(true);
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
}
#endif
}
Ptr<DataLayer> netInputLayer;
@@ -1300,13 +1270,6 @@ struct Net::Impl : public detail::NetImplBase
}
Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
#ifdef HAVE_CUDA
if (preferableBackend == DNN_BACKEND_CUDA)
{
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
}
#endif
backendWrappers[data] = wrapper;
return wrapper;
}
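Note that the eager context creation removed from the Net::Impl constructor and the per-wrapper setStream() call removed from wrap() are not lost: both reappear below inside initCUDABackend(), so a Net no longer touches the GPU unless the CUDA backend is actually selected.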
@@ -2374,10 +2337,57 @@ struct Net::Impl : public detail::NetImplBase
#endif
}
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
{
CV_Assert(haveCUDA());
CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
#ifdef HAVE_CUDA
if (cuda4dnn::getDeviceCount() <= 0)
CV_Error(Error::StsError, "No CUDA capable device found.");
if (cuda4dnn::getDevice() < 0)
CV_Error(Error::StsError, "No CUDA capable device selected.");
if (!cuda4dnn::isDeviceCompatible())
CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
if (!cudaInfo)
{
cuda4dnn::csl::CSLContext context;
context.stream = cuda4dnn::csl::Stream(true);
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
cuda4dnn::checkVersions();
}
cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
for (auto& layer : layers)
{
auto& ld = layer.second;
if (ld.id == 0)
{
for (auto& wrapper : ld.inputBlobsWrappers)
{
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
}
}
for (auto& wrapper : ld.outputBlobsWrappers)
{
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
}
}
for (auto& layer : layers)
{
auto& ld = layer.second;
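From the application side, the guards added at the top of initCUDABackend() surface as a cv::Exception with a descriptive message once the network is actually run. A hedged sketch of how this is observed (the model file and input shape are placeholders; the message text comes from the CV_Error calls above):

#include <opencv2/dnn.hpp>
#include <iostream>

int main()
{
    cv::dnn::Net net = cv::dnn::readNet("model.onnx"); // placeholder model file
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);

    const int sz[] = {1, 3, 224, 224};       // placeholder input shape
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));
    net.setInput(blob);

    try
    {
        cv::Mat out = net.forward(); // backend initialization happens lazily here
    }
    catch (const cv::Exception& e)
    {
        // e.g. "The selected CUDA device does not support FP16 operations."
        std::cerr << e.what() << std::endl;
    }
    return 0;
}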
@@ -2653,11 +2663,11 @@ struct Net::Impl : public detail::NetImplBase
if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
{
// we create a temporary backend node for eltwise layer to obtain the eltwise configuration
auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */
cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
nextEltwiseLayer = Ptr<EltwiseLayer>();
nextEltwiseLayer = Ptr<EltwiseLayer>();
// check for variable channels
auto& inputs = nextData->inputBlobs;
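Design note on the hunk above: since cudaInfo is no longer created eagerly, the fusion check cannot copy cudaInfo->context anymore; it now passes a default-constructed CSLContext, relying on the stated assumption that the eltwise stub's initCUDA() does not use the context, because the resulting node is only inspected for its op type and coefficients.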
