From 769564c1302eef0091aef064af5a3c95b5560e7b Mon Sep 17 00:00:00 2001
From: Andrey Morozov
Date: Mon, 26 Jul 2010 11:22:16 +0000
Subject: [PATCH] implemented asynchronous call for gpumat::setTo(), gpumat::copyTo(), gpumat::converTo()

---
 modules/gpu/include/opencv2/gpu/gpu.hpp   | 102 +++++++++++-----------
 modules/gpu/src/cuda/cuda_shared.hpp      |   8 +-
 modules/gpu/src/cuda/matrix_operations.cu |  87 +++++++++++-------
 modules/gpu/src/cudastream.cpp            |   3 +-
 4 files changed, 114 insertions(+), 86 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index c50351e841..28544bc686 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -49,24 +49,24 @@ namespace cv
 {
     namespace gpu
-    {
+    {
         //////////////////////////////// Initialization ////////////////////////
-
+
         //! This is the only function that do not throw exceptions if the library is compiled without Cuda.
         CV_EXPORTS int getCudaEnabledDeviceCount();

         //! Functions below throw cv::Expception if the library is compiled without Cuda.
         CV_EXPORTS string getDeviceName(int device);
-        CV_EXPORTS void setDevice(int device);
-        CV_EXPORTS int getDevice();
+        CV_EXPORTS void setDevice(int device);
+        CV_EXPORTS int getDevice();
         CV_EXPORTS void getComputeCapability(int device, int* major, int* minor);
         CV_EXPORTS int getNumberOfSMs(int device);
-
+
         //////////////////////////////// GpuMat ////////////////////////////////

-        class CudaStrem;
+        class CudaStream;

-        //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
+        //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
         class CV_EXPORTS GpuMat
         {
         public:
@@ -81,7 +81,7 @@
             GpuMat(Size _size, int _type, const Scalar& _s);
             //! copy constructor
             GpuMat(const GpuMat& m);
-
+
             //! constructor for GpuMatrix headers pointing to user-allocated data
             GpuMat(int _rows, int _cols, int _type, void* _data, size_t _step = Mat::AUTO_STEP);
             GpuMat(Size _size, int _type, void* _data, size_t _step = Mat::AUTO_STEP);
@@ -89,7 +89,7 @@
             //! creates a matrix header for a part of the bigger matrix
             GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);
             GpuMat(const GpuMat& m, const Rect& roi);
-
+
             //! builds GpuMat from Mat. Perfom blocking upload to device.
             explicit GpuMat (const Mat& m);
@@ -99,7 +99,7 @@
             //! assignment operators
             GpuMat& operator = (const GpuMat& m);
             //! assignment operator. Perfom blocking upload to device.
-            GpuMat& operator = (const Mat& m);
+            GpuMat& operator = (const Mat& m);

             //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
             // Contains just image size, data ptr and step.
@@ -110,7 +110,7 @@
             //! Downloads data from device to host memory. Blocking calls.
             operator Mat() const;
-            void download(cv::Mat& m) const;
+            void download(cv::Mat& m) const;

             //! returns a new GpuMatrix header for the specified row
             GpuMat row(int y) const;
@@ -161,7 +161,7 @@
             //! extracts a rectangular sub-GpuMatrix
             // (this is a generalized form of row, rowRange etc.)
             GpuMat operator()( Range rowRange, Range colRange ) const;
-            GpuMat operator()( const Rect& roi ) const;
+            GpuMat operator()( const Rect& roi ) const;

             //! returns true iff the GpuMatrix data is continuous
             // (i.e. when there are no gaps between successive rows).
@@ -222,33 +222,33 @@
         // Page locked memory is only needed for async and faster coping to GPU.
         // It is convertable to cv::Mat header without reference counting
         // so you can use it with other opencv functions.
-
+
         class CV_EXPORTS MatPL
         {
-        public:
+        public:
            //Not supported. Now behaviour is like ALLOC_DEFAULT.
            //enum { ALLOC_DEFAULT = 0, ALLOC_PORTABLE = 1, ALLOC_WRITE_COMBINED = 4 }

-            MatPL();
-            MatPL(const MatPL& m);
+            MatPL();
+            MatPL(const MatPL& m);

            MatPL(int _rows, int _cols, int _type);
-            MatPL(Size _size, int _type);
+            MatPL(Size _size, int _type);

            //! creates from cv::Mat with coping data
            explicit MatPL(const Mat& m);
-
-            ~MatPL();
+
+            ~MatPL();

            MatPL& operator = (const MatPL& m);
-
+
            //! returns deep copy of the matrix, i.e. the data is copied
            MatPL clone() const;
-
-            //! allocates new matrix data unless the matrix already has specified size and type.
+
+            //! allocates new matrix data unless the matrix already has specified size and type.
            void create(int _rows, int _cols, int _type);
-            void create(Size _size, int _type);
+            void create(Size _size, int _type);

            //! decrements reference counter and released memory if needed.
            void release();
@@ -256,25 +256,25 @@
            //! returns matrix header with disabled reference counting for MatPL data.
            Mat createMatHeader() const;
            operator Mat() const;
-
+
            // Please see cv::Mat for descriptions
-            bool isContinuous() const;
-            size_t elemSize() const;
-            size_t elemSize1() const;
-            int type() const;
-            int depth() const;
-            int channels() const;
-            size_t step1() const;
-            Size size() const;
+            bool isContinuous() const;
+            size_t elemSize() const;
+            size_t elemSize1() const;
+            int type() const;
+            int depth() const;
+            int channels() const;
+            size_t step1() const;
+            Size size() const;
            bool empty() const;
-
+
            // Please see cv::Mat for descriptions
-            int flags;
-            int rows, cols;
+            int flags;
+            int rows, cols;
            size_t step;
-            uchar* data;
-            int* refcount;
+            uchar* data;
+            int* refcount;

            uchar* datastart;
            uchar* dataend;
@@ -288,37 +288,37 @@
        class CV_EXPORTS CudaStream
        {
        public:
-            CudaStream();
+            CudaStream();
            ~CudaStream();

-            CudaStream(const CudaStream&);
+            CudaStream(const CudaStream&);
            CudaStream& operator=(const CudaStream&);

            bool queryIfComplete();
-            void waitForCompletion();
+            void waitForCompletion();

-            //! downloads asynchronously.
+            //! downloads asynchronously.
            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its subMat)
            void enqueueDownload(const GpuMat& src, MatPL& dst);
            void enqueueDownload(const GpuMat& src, Mat& dst);

-            //! uploads asynchronously.
+            //! uploads asynchronously.
            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its ROI)
-            void enqueueUpload(const MatPL& src, GpuMat& dst);
+            void enqueueUpload(const MatPL& src, GpuMat& dst);
            void enqueueUpload(const Mat& src, GpuMat& dst);

            void enqueueCopy(const GpuMat& src, GpuMat& dst);
-
-            void enqueueMemSet(const GpuMat& src, Scalar val);
+
+            void enqueueMemSet(const GpuMat& src, Scalar val);
            void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);

            // converts matrix type, ex from float to uchar depending on type
-            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
+            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
        private:
            void create();
            void release();
            struct Impl;
-            Impl *impl;
+            Impl *impl;
            friend struct StreamAccessor;
        };
@@ -348,7 +348,7 @@
            //! Acync version
            void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream);

-            //! Some heuristics that tries to estmate
+            //! Some heuristics that tries to estmate
            // if current GPU will be faster then CPU in this algorithm.
            // It queries current active device.
            static bool checkIfGpuCallReasonable();
@@ -356,11 +356,11 @@
            int ndisp;
            int winSize;
            int preset;
-
+
            // If avergeTexThreshold == 0 => post procesing is disabled
            // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
            // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-            // i.e. input left image is low textured.
+            // i.e. input left image is low textured.
            float avergeTexThreshold;
        private:
            GpuMat minSSD, leBuf, riBuf;
@@ -369,4 +369,4 @@
 }
 #include "opencv2/gpu/matrix_operations.hpp"
-#endif /* __OPENCV_GPU_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_HPP__ */
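For orientation, the following is a minimal usage sketch of the asynchronous API declared in gpu.hpp above (CudaStream together with the page-locked MatPL buffers). It is illustrative only and not part of the patch: the sizes, types and variable names are made up, and error handling and device selection are omitted.

    #include "opencv2/gpu/gpu.hpp"

    void async_roundtrip(const cv::Mat& host_src, cv::Mat& host_dst)
    {
        using namespace cv::gpu;

        // Page-locked host buffers are what make the transfers truly asynchronous.
        MatPL pinned_src(host_src);                                   // copies host_src into page-locked memory
        MatPL pinned_dst(host_src.rows, host_src.cols, CV_32FC1);

        GpuMat d_src(host_src.rows, host_src.cols, host_src.type());
        GpuMat d_dst;

        CudaStream stream;
        stream.enqueueUpload(pinned_src, d_src);        // host -> device copy, queued on the stream
        stream.enqueueConvert(d_src, d_dst, CV_32FC1);  // type conversion queued on the stream, no implicit sync
        stream.enqueueDownload(d_dst, pinned_dst);      // device -> host copy, queued on the stream

        // ... unrelated CPU work can overlap with the queued operations here ...

        stream.waitForCompletion();                     // block until everything queued above has finished
        host_dst = pinned_dst.createMatHeader().clone();
    }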
diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index 0b6a63b3d6..fbec7cff69 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -61,12 +61,12 @@ namespace cv
 {
         static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }

-        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels);
+        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

-        extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);
-        extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);
+        extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);
+        extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

-        extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
+        extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);
         }
     }
 }
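The new stream parameter is defaulted to 0, so existing synchronous call sites keep compiling unchanged, while passing a non-zero stream switches the launchers in the next file to the enqueue-only path. A small sketch (illustrative only; the names d_mat, vals and s are hypothetical):

    // Old, blocking behaviour: launches on the default stream and synchronizes before returning.
    set_to_without_mask(d_mat, CV_8U, vals, 1);

    // New, asynchronous behaviour: work is only enqueued on s; the caller synchronizes later.
    set_to_without_mask(d_mat, CV_8U, vals, 1, s);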
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 988cf7eaa6..b3cb37d6af 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -42,7 +42,6 @@

 #include
 #include
-//#include
 #include "cuda_shared.hpp"
 #include "cuda_runtime.h"
@@ -239,19 +238,27 @@
         ///////////////////////////////////////////////////////////////////////////
         ////////////////////////////////// CopyTo /////////////////////////////////
         ///////////////////////////////////////////////////////////////////////////

-        typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
+        typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);

         template <typename T>
-        void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
+        void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
         {
             dim3 threadsPerBlock(16,16, 1);
             dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
-            ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks, threadsPerBlock>>>
-                    ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-            cudaSafeCall ( cudaThreadSynchronize() );
+            if (stream == 0)
+            {
+                ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks, threadsPerBlock>>>
+                        ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+                cudaSafeCall ( cudaThreadSynchronize() );
+            }
+            else
+            {
+                ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>
+                        ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+            }
         }

-        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
+        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
         {
             static CopyToFunc tab[8] =
             {
@@ -269,7 +276,7 @@
             if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-            func(mat_src, mat_dst, mask, channels);
+            func(mat_src, mat_dst, mask, channels, stream);
         }

@@ -277,28 +284,43 @@
         ///////////////////////////////////////////////////////////////////////////
         ////////////////////////////////// SetTo //////////////////////////////////
         ///////////////////////////////////////////////////////////////////////////

-        typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
-        typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
+        typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
+        typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);

         template <typename T>
-        void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
+        void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
         {
             dim3 threadsPerBlock(32, 8, 1);
             dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-            ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
-            cudaSafeCall ( cudaThreadSynchronize() );
+            if (stream == 0)
+            {
+                ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+                cudaSafeCall ( cudaThreadSynchronize() );
+            }
+            else
+            {
+                ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+            }
+
         }

         template <typename T>
-        void set_to_without_mask_run(const DevMem2D& mat, int channels)
+        void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
         {
             dim3 threadsPerBlock(32, 8, 1);
             dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-            ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
-            cudaSafeCall ( cudaThreadSynchronize() );
+            if (stream == 0)
+            {
+                ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+                cudaSafeCall ( cudaThreadSynchronize() );
+            }
+            else
+            {
+                ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+            }
         }

-        extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
+        extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
         {
             double data[4];
             data[0] = scalar[0];
@@ -323,11 +345,11 @@
             if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-            func(mat, channels);
+            func(mat, channels, stream);
         }

-        extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
+        extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
         {
             double data[4];
             data[0] = scalar[0];
@@ -352,7 +374,7 @@
             if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-            func(mat, mask, channels);
+            func(mat, mask, channels, stream);
         }

@@ -360,22 +382,27 @@
         ///////////////////////////////////////////////////////////////////////////
         //////////////////////////////// ConvertTo ////////////////////////////////
         ///////////////////////////////////////////////////////////////////////////

-        typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
+        typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);

         template <typename T, typename DT>
-        void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
+        void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
         {
             const int shift = ::mat_operators::ReadWriteTraits::shift;

             dim3 block(32, 8);
             dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
-
-            ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-
-        extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
+            if (stream == 0)
+            {
+                ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+                cudaSafeCall( cudaThreadSynchronize() );
+            }
+            else
+            {
+                ::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+            }
+        }
+
+        extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
         {
             static CvtFunc tab[8][8] =
             {
@@ -406,7 +433,7 @@
             CvtFunc func = tab[sdepth][ddepth];
             if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
-            func(src, dst, width, height, alpha, beta);
+            func(src, dst, width, height, alpha, beta, stream);
         }
     } // namespace impl
 } // namespace gpu
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index d17fdb0051..8c5b69ae84 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -74,6 +74,7 @@ struct CudaStream::Impl
     cudaStream_t stream;
     int ref_counter;
 };
+
 namespace
 {
     template <typename S, typename D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
@@ -147,7 +148,7 @@ void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
 {
     // if not -> allocation will be done, but after that dst will not point to page locked memory
     CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
-    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
+    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }

 void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst)
 {
     devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }
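As the comment in the final hunk notes, the cv::Mat overload of enqueueDownload() asserts that dst already has matching size and type: if it had to reallocate, the result would no longer point to page-locked memory. In practice the destination is a header over MatPL data. A small sketch (illustrative only; d_img, stream and the size/type are hypothetical and assumed to match):

    cv::gpu::MatPL pinned(480, 640, CV_8UC1);   // page-locked host buffer
    cv::Mat dst = pinned.createMatHeader();     // header over the pinned data, no copy, no refcount
    stream.enqueueDownload(d_img, dst);         // ok: dst matches d_img and points to page-locked memory
    stream.waitForCompletion();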