Dynamic CUDA support library loading implemented for Linux.

Logical mistake in macro fixed;
DeviceInfo delegate reimplemented;
Build and warning fixes.
pull/2022/head
Alexander Smorkalov 11 years ago
parent d4087f19a2
commit 8660e048bc
  1. modules/core/CMakeLists.txt (64 changes)
  2. modules/core/cuda/CMakeLists.txt (3 changes)
  3. modules/core/cuda/main.cpp (29 changes)
  4. modules/core/include/opencv2/core/gpumat.hpp (3 changes)
  5. modules/core/src/gpumat.cpp (93 changes)
  6. modules/core/src/gpumat_cuda.hpp (382 changes)
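In outline, the new scheme works like this: libopencv_core_cuda.so exports two extern "C" factory functions, and the core module dlopen()s that library at run time, falling back to the Empty* stub tables when it is absent. A condensed sketch of both sides (illustrative names, not the exact code from this commit):

// --- CUDA side: compiled into libopencv_core_cuda.so ---
struct GpuTable { virtual int deviceCount() const = 0; virtual ~GpuTable() {} };
struct CudaTable : GpuTable { int deviceCount() const { return 1; } };

extern "C" GpuTable* gpuFactory()       // C linkage: dlsym finds the plain name
{
    static CudaTable table;
    return &table;
}

// --- core side: no link-time CUDA dependency ---
#include <dlfcn.h>

struct StubTable : GpuTable { int deviceCount() const { return 0; } };

GpuTable* table()
{
    static StubTable stub;
    static GpuTable* t = 0;
    if (!t)
    {
        typedef GpuTable* (*Factory)();
        void* h = dlopen("libopencv_core_cuda.so", RTLD_LAZY);
        Factory f = h ? (Factory)dlsym(h, "gpuFactory") : 0;
        t = f ? f() : &stub;    // fall back to the stub when the lib is absent
    }
    return t;
}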

@@ -1,29 +1,69 @@
set(the_description "The Core Functionality")
macro(ocv_glob_module_sources_no_cuda)
file(GLOB_RECURSE lib_srcs "src/*.cpp")
file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
set(cuda_objs "")
set(lib_cuda_hdrs "")
if(HAVE_CUDA)
ocv_include_directories(${CUDA_INCLUDE_DIRS})
file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
endif()
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
file(GLOB cl_kernels "src/opencl/*.cl")
if(HAVE_opencv_ocl AND cl_kernels)
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
endif()
source_group("Include" FILES ${lib_hdrs})
source_group("Include\\detail" FILES ${lib_hdrs_detail})
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs})
endmacro()
ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
if(HAVE_WINRT)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
endif()
file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
if(DYNAMIC_CUDA_SUPPORT)
add_definitions(-DDYNAMIC_CUDA_SUPPORT)
else()
add_definitions(-DUSE_CUDA)
endif()
ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
if(HAVE_CUDA)
ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
endif()
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
if (DYNAMIC_CUDA_SUPPORT)
ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
else()
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
endif()
ocv_create_module()
ocv_add_precompiled_headers(${the_module})
@@ -31,6 +71,6 @@ ocv_add_precompiled_headers(${the_module})
ocv_add_accuracy_tests()
ocv_add_perf_tests()
if(DYNAMIC_CUDA_SUPPORT)
if (DYNAMIC_CUDA_SUPPORT)
add_subdirectory(cuda)
endif()
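The macro logic above reduces to a three-way preprocessor split that the C++ sources later in this commit consume; summarized here (not code from the diff):

#if defined(DYNAMIC_CUDA_SUPPORT)
    // core is built without CUDA sources; the function tables are fetched
    // from libopencv_core_cuda.so at run time (see gpumat.cpp below)
#elif defined(USE_CUDA)
    // CUDA is linked directly into core; the Cuda* tables are compiled in
#else
    // no CUDA at all; the Empty* stub tables are used
#endif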

@@ -1,6 +1,5 @@
project(opencv_core_cuda)
set(HAVE_CUDA FALSE)
add_definitions("-DHAVE_CUDA")
add_definitions(-DUSE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS}
"../src/"
"../include/opencv2/core/"

@@ -1,6 +1,10 @@
#include "cvconfig.h"
#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include <stdio.h>
#include <iostream>
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#include <npp.h>
@@ -17,7 +21,30 @@
#endif
#endif
using namespace std;
using namespace cv;
using namespace cv::gpu;
#include "gpumat_cuda.hpp"
#include "gpumat_cuda.hpp"
#ifdef HAVE_CUDA
static CudaDeviceInfoFuncTable deviceInfoTable;
static CudaFuncTable gpuTable;
#else
static EmptyDeviceInfoFuncTable deviceInfoTable;
static EmptyFuncTable gpuTable;
#endif
extern "C" {
DeviceInfoFuncTable* deviceInfoFactory()
{
return (DeviceInfoFuncTable*)&deviceInfoTable;
}
GpuFuncTable* gpuFactory()
{
return (GpuFuncTable*)&gpuTable;
}
}
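The factories are exported with C linkage so their symbol names stay unmangled and dlsym can resolve them by the literal strings "deviceInfoFactory" and "gpuFactory". A quick standalone check (assumes the built library is on the loader search path):

#include <dlfcn.h>
#include <cstdio>

int main()
{
    void* handle = dlopen("libopencv_core_cuda.so", RTLD_LAZY);
    if (!handle)
    {
        std::printf("dlopen failed: %s\n", dlerror());
        return 1;
    }
    // Without extern "C" these lookups would fail: the mangled C++ names
    // differ from the plain identifiers.
    std::printf("deviceInfoFactory: %p\n", dlsym(handle, "deviceInfoFactory"));
    std::printf("gpuFactory:        %p\n", dlsym(handle, "gpuFactory"));
    dlclose(handle);
    return 0;
}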

@@ -137,6 +137,9 @@ namespace cv { namespace gpu
int deviceID() const { return device_id_; }
private:
// The private section is fictive: it is kept only to preserve binary compatibility.
// Changes to the private fields here have no effect;
// see the delegate code.
void query();
int device_id_;
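Keeping the now-unused private members is deliberate: DeviceInfo is part of the library's binary interface, and removing fields would shrink the object and shift its layout for applications compiled against the old header. A toy illustration of the hazard (not OpenCV code):

#include <cstdio>

struct WithField    { int device_id_; };   // old layout, sizeof == 4 (typically)
struct FieldRemoved { };                   // sizeof == 1: a different ABI

int main()
{
    std::printf("%zu vs %zu\n", sizeof(WithField), sizeof(FieldRemoved));
    return 0;   // different sizes mean binary-incompatible object layouts
}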

@@ -43,8 +43,9 @@
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
#include <iostream>
#include <dlfcn.h>
#if defined(HAVE_CUDA)
#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
#include <cuda_runtime.h>
#include <npp.h>
@@ -66,15 +67,81 @@ using namespace cv::gpu;
#include "gpumat_cuda.hpp"
namespace
typedef GpuFuncTable* (*GpuFactoryType)();
typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
static GpuFactoryType gpuFactory = NULL;
static DeviceInfoFactoryType deviceInfoFactory = NULL;
static const std::string getCudaSupportLibName()
{
return "libopencv_core_cuda.so";
}
static bool loadCudaSupportLib()
{
const GpuFuncTable* gpuFuncTable()
void* handle;
const std::string name = getCudaSupportLibName();
handle = dlopen(name.c_str(), RTLD_LAZY);
if (!handle)
return false;
deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory");
if (!deviceInfoFactory)
{
static EmptyFuncTable funcTable;
return &funcTable;
dlclose(handle);
return false;
}
gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
if (!gpuFactory)
{
dlclose(handle);
return false;
}
dlclose(handle);
return true;
}
static GpuFuncTable* gpuFuncTable()
{
#ifdef DYNAMIC_CUDA_SUPPORT
static EmptyFuncTable stub;
static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub;
static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub;
#else
# ifdef USE_CUDA
static CudaFuncTable impl;
static GpuFuncTable* funcTable = &impl;
#else
static EmptyFuncTable stub;
static GpuFuncTable* funcTable = &stub;
#endif
#endif
return funcTable;
}
static DeviceInfoFuncTable* deviceInfoFuncTable()
{
#ifdef DYNAMIC_CUDA_SUPPORT
static EmptyDeviceInfoFuncTable stub;
static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub;
static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub;
#else
# ifdef USE_CUDA
static CudaDeviceInfoFuncTable impl;
static DeviceInfoFuncTable* funcTable = &impl;
#else
static EmptyDeviceInfoFuncTable stub;
static DeviceInfoFuncTable* funcTable = &stub;
#endif
#endif
return funcTable;
}
//////////////////////////////// Initialization & Info ////////////////////////
int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
@@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); }
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); }
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); }
void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); }
size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); }
size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); }
bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); }
bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); }
void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); }
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); }
void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); }
size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); }
size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); }
bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); }
bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); }
void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }
void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); }
void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); }
@@ -556,7 +623,7 @@ namespace cv { namespace gpu
void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
{
gpuFuncTable()->setTo(src, s, stream);
gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream);
}
void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)

@@ -1,30 +1,19 @@
namespace
{
#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
#ifndef __GPUMAT_CUDA_HPP__
#define __GPUMAT_CUDA_HPP__
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
class DeviceInfoFuncTable
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
std::ostringstream msg;
msg << "NPP API Call Error: " << err;
cv::gpu::error(msg.str().c_str(), file, line, func);
}
}
#endif
}
namespace
{
public:
virtual size_t sharedMemPerBlock() const = 0;
virtual void queryMemory(size_t&, size_t&) const = 0;
virtual size_t freeMemory() const = 0;
virtual size_t totalMemory() const = 0;
virtual bool supports(FeatureSet) const = 0;
virtual bool isCompatible() const = 0;
virtual void query() = 0;
virtual ~DeviceInfoFuncTable() {};
};
class GpuFuncTable
{
public:
@@ -40,6 +29,7 @@ namespace
virtual bool deviceSupports(FeatureSet) const = 0;
// TargetArchs
virtual bool builtWith(FeatureSet) const = 0;
virtual bool has(int, int) const = 0;
virtual bool hasPtx(int, int) const = 0;
@@ -49,14 +39,6 @@ namespace
virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
virtual bool hasEqualOrGreaterBin(int, int) const = 0;
virtual size_t sharedMemPerBlock() const = 0;
virtual void queryMemory(size_t&, size_t&) const = 0;
virtual size_t freeMemory() const = 0;
virtual size_t totalMemory() const = 0;
virtual bool supports(FeatureSet) const = 0;
virtual bool isCompatible() const = 0;
virtual void query() const = 0;
virtual void printCudaDeviceInfo(int) const = 0;
virtual void printShortCudaDeviceInfo(int) const = 0;
@@ -72,17 +54,24 @@ namespace
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
// for gpu::device::setTo funcs
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0;
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
}
#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
namespace
{
class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
{
public:
size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
void queryMemory(size_t&, size_t&) const { throw_nogpu; }
size_t freeMemory() const { throw_nogpu; return 0; }
size_t totalMemory() const { throw_nogpu; return 0; }
bool supports(FeatureSet) const { throw_nogpu; return false; }
bool isCompatible() const { throw_nogpu; return false; }
void query() { throw_nogpu; }
};
class EmptyFuncTable : public GpuFuncTable
{
public:
@@ -105,15 +94,7 @@ namespace
bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
void queryMemory(size_t&, size_t&) const { throw_nogpu; }
size_t freeMemory() const { throw_nogpu; return 0; }
size_t totalMemory() const { throw_nogpu; return 0; }
bool supports(FeatureSet) const { throw_nogpu; return false; }
bool isCompatible() const { throw_nogpu; return false; }
void query() const { throw_nogpu; }
void printCudaDeviceInfo(int) const { throw_nogpu; }
void printShortCudaDeviceInfo(int) const { throw_nogpu; }
@@ -126,15 +107,32 @@ namespace
void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; }
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
void free(void*) const {}
};
#if defined(USE_CUDA)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#else
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
std::ostringstream msg;
msg << "NPP API Call Error: " << err;
cv::gpu::error(msg.str().c_str(), file, line, func);
}
}
namespace cv { namespace gpu { namespace device
{
@@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device
void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
}}}
namespace
{
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
{
Scalar_<T> sf = s;
@@ -162,10 +158,7 @@ namespace
Scalar_<T> sf = s;
cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
namespace
{
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_8S> { typedef Npp8s npp_type; };
@@ -208,6 +201,7 @@ namespace
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
@@ -361,9 +355,8 @@ namespace
{
return reinterpret_cast<size_t>(ptr) % size == 0;
}
}
namespace cv { namespace gpu { namespace devices
namespace cv { namespace gpu { namespace device
{
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
{
@@ -418,74 +411,52 @@ namespace
{
setTo(src, s, mask, 0);
}
}}
}}}
namespace
{
class CudaFuncTable : public GpuFuncTable
class CudaArch
{
protected:
class CudaArch
{
public:
CudaArch();
bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;
private:
static void fromStr(const string& set_as_str, vector<int>& arr);
vector<int> bin;
vector<int> ptx;
vector<int> features;
};
const CudaArch cudaArch;
CudaArch::CudaArch()
public:
CudaArch()
{
fromStr(CUDA_ARCH_BIN, bin);
fromStr(CUDA_ARCH_PTX, ptx);
fromStr(CUDA_ARCH_FEATURES, features);
}
bool CudaArch::builtWith(FeatureSet feature_set) const
bool builtWith(FeatureSet feature_set) const
{
return !features.empty() && (features.back() >= feature_set);
}
bool CudaArch::hasPtx(int major, int minor) const
bool hasPtx(int major, int minor) const
{
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
}
bool CudaArch::hasBin(int major, int minor) const
bool hasBin(int major, int minor) const
{
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
}
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
bool hasEqualOrLessPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
bool hasEqualOrGreaterPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
bool hasEqualOrGreaterBin(int major, int minor) const
{
return !bin.empty() && (bin.back() >= major * 10 + minor);
}
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
private:
void fromStr(const string& set_as_str, vector<int>& arr)
{
if (set_as_str.find_first_not_of(" ") == string::npos)
return;
@@ -501,25 +472,21 @@ namespace
sort(arr.begin(), arr.end());
}
class DeviceProps
{
public:
DeviceProps();
~DeviceProps();
cudaDeviceProp* get(int devID);
private:
std::vector<cudaDeviceProp*> props_;
};
DeviceProps::DeviceProps()
vector<int> bin;
vector<int> ptx;
vector<int> features;
};
class DeviceProps
{
public:
DeviceProps()
{
props_.resize(10, 0);
}
DeviceProps::~DeviceProps()
~DeviceProps()
{
for (size_t i = 0; i < props_.size(); ++i)
{
@@ -529,7 +496,7 @@ namespace
props_.clear();
}
cudaDeviceProp* DeviceProps::get(int devID)
cudaDeviceProp* get(int devID)
{
if (devID >= (int) props_.size())
props_.resize(devID + 5, 0);
@@ -542,10 +509,92 @@ namespace
return props_[devID];
}
private:
std::vector<cudaDeviceProp*> props_;
};
DeviceProps deviceProps;
class CudaDeviceInfoFuncTable: DeviceInfoFuncTable
{
public:
size_t sharedMemPerBlock() const
{
return deviceProps.get(device_id_)->sharedMemPerBlock;
}
void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
{
int prevDeviceID = getDevice();
if (prevDeviceID != device_id_)
setDevice(device_id_);
cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
if (prevDeviceID != device_id_)
setDevice(prevDeviceID);
}
DeviceProps deviceProps;
size_t freeMemory() const
{
size_t _totalMemory, _freeMemory;
queryMemory(_totalMemory, _freeMemory);
return _freeMemory;
}
size_t totalMemory() const
{
size_t _totalMemory, _freeMemory;
queryMemory(_totalMemory, _freeMemory);
return _totalMemory;
}
bool supports(FeatureSet feature_set) const
{
int version = majorVersion_ * 10 + minorVersion_;
return version >= feature_set;
}
bool isCompatible() const
{
// Check PTX compatibility
if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_))
return true;
// Check BIN compatibility
for (int i = minorVersion_; i >= 0; --i)
if (TargetArchs::hasBin(majorVersion_, i))
return true;
return false;
}
void query()
{
const cudaDeviceProp* prop = deviceProps.get(device_id_);
name_ = prop->name;
multi_processor_count_ = prop->multiProcessorCount;
majorVersion_ = prop->major;
minorVersion_ = prop->minor;
}
private:
int device_id_;
std::string name_;
int multi_processor_count_;
int majorVersion_;
int minorVersion_;
};
class CudaFuncTable : public GpuFuncTable
{
protected:
const CudaArch cudaArch;
int convertSMVer2Cores(int major, int minor)
int convertSMVer2Cores(int major, int minor) const
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
typedef struct {
@@ -600,42 +649,42 @@ namespace
cudaSafeCall( cudaDeviceReset() );
}
bool TargetArchs::builtWith(FeatureSet feature_set) const
bool builtWith(FeatureSet feature_set) const
{
return cudaArch.builtWith(feature_set);
}
bool TargetArchs::has(int major, int minor) const
bool has(int major, int minor) const
{
return hasPtx(major, minor) || hasBin(major, minor);
}
bool TargetArchs::hasPtx(int major, int minor) const
bool hasPtx(int major, int minor) const
{
return cudaArch.hasPtx(major, minor);
}
bool TargetArchs::hasBin(int major, int minor) const
bool hasBin(int major, int minor) const
{
return cudaArch.hasBin(major, minor);
}
bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const
bool hasEqualOrLessPtx(int major, int minor) const
{
return cudaArch.hasEqualOrLessPtx(major, minor);
}
bool TargetArchs::hasEqualOrGreater(int major, int minor) const
bool hasEqualOrGreater(int major, int minor) const
{
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
}
bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const
bool hasEqualOrGreaterPtx(int major, int minor) const
{
return cudaArch.hasEqualOrGreaterPtx(major, minor);
}
bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const
bool hasEqualOrGreaterBin(int major, int minor) const
{
return cudaArch.hasEqualOrGreaterBin(major, minor);
}
@@ -664,68 +713,7 @@ namespace
return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
size_t sharedMemPerBlock() const
{
return deviceProps.get(device_id_)->sharedMemPerBlock;
}
void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
{
int prevDeviceID = getDevice();
if (prevDeviceID != device_id_)
setDevice(device_id_);
cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
if (prevDeviceID != device_id_)
setDevice(prevDeviceID);
}
size_t freeMemory() const
{
size_t _totalMemory, _freeMemory;
queryMemory(_totalMemory, _freeMemory);
return _freeMemory;
}
size_t totalMemory() const
{
size_t _totalMemory, _freeMemory;
queryMemory(_totalMemory, _freeMemory);
return _totalMemory;
}
bool supports(FeatureSet feature_set) const
{
int version = majorVersion() * 10 + minorVersion();
return version >= feature_set;
}
bool isCompatible() const
{
// Check PTX compatibility
if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
return true;
// Check BIN compatibility
for (int i = minorVersion(); i >= 0; --i)
if (TargetArchs::hasBin(majorVersion(), i))
return true;
return false;
}
void query() const
{
const cudaDeviceProp* prop = deviceProps.get(device_id_);
name_ = prop->name;
multi_processor_count_ = prop->multiProcessorCount;
majorVersion_ = prop->major;
minorVersion_ = prop->minor;
}
void printCudaDeviceInfo(int device) const
{
int count = getCudaEnabledDeviceCount();
@@ -864,16 +852,16 @@ namespace
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
static const func_t funcs[7][4] =
{
/* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
/* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask },
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
/* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }
/* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
/* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask },
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
/* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }
};
const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask;
const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
func(src, dst, mask, 0);
}
@@ -971,7 +959,7 @@ namespace
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
{
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
CV_Assert(dst.depth() <= CV_64F);
@@ -982,10 +970,10 @@ namespace
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
cv::gpu::device::convertTo(src, dst, alpha, beta);
cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
}
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
{
if (mask.empty())
{
@@ -1016,7 +1004,7 @@ namespace
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
};
CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
@@ -1027,7 +1015,10 @@ namespace
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
funcs[m.depth()][m.channels() - 1](m, s);
if (stream)
cv::gpu::device::setTo(m, s, stream);
else
funcs[m.depth()][m.channels() - 1](m, s);
}
else
{
@@ -1051,7 +1042,10 @@ namespace
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
funcs[m.depth()][m.channels() - 1](m, s, mask);
if (stream)
cv::gpu::device::setTo(m, s, mask, stream);
else
funcs[m.depth()][m.channels() - 1](m, s, mask);
}
}
@@ -1065,5 +1059,5 @@ namespace
cudaFree(devPtr);
}
};
}
#endif
#endif
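The reworked setTo/convert overloads at the end thread a cudaStream_t through the dispatch: a non-null stream selects the asynchronous cv::gpu::device kernels, while a null stream keeps the synchronous NPP table entries. On the caller side this stays invisible; a minimal sketch (assumes a CUDA-capable build and device):

#include "opencv2/core/gpumat.hpp"

int main()
{
    cv::gpu::GpuMat m(480, 640, CV_8UC1);
    m.setTo(cv::Scalar::all(0));   // stream == 0: synchronous NPP-backed path
    // Internally, a non-null cudaStream_t would route the same call to
    // cv::gpu::device::setTo, the asynchronous kernel path.
    return 0;
}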