diff --git a/modules/core/include/opencv2/core/gpu.hpp b/modules/core/include/opencv2/core/gpu.hpp
index c22803bad0..46d07100a8 100644
--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@@ -51,8 +51,7 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/gpu_types.hpp"
 
-namespace cv { namespace gpu
-{
+namespace cv { namespace gpu {
 
 //////////////////////////////// GpuMat ///////////////////////////////
 
@@ -337,59 +336,56 @@ CV_EXPORTS void registerPageLocked(Mat& m);
 //! unmaps the memory of matrix m, and makes it pageable again
 CV_EXPORTS void unregisterPageLocked(Mat& m);
 
-//////////////////////////////// CudaStream ////////////////////////////////
+///////////////////////////////// Stream //////////////////////////////////
+
 // Encapsulates CUDA stream. Provides interface for async copying.
 // Passed to each function that supports async kernel execution.
-// Reference counting is enabled
+// Reference counting is enabled.
 class CV_EXPORTS Stream
 {
+    typedef void (Stream::*bool_type)() const;
+    void this_type_does_not_support_comparisons() const {}
+
 public:
+    typedef void (*StreamCallback)(int status, void* userData);
+
+    //! creates a new asynchronous stream
     Stream();
-    ~Stream();
 
-    Stream(const Stream&);
-    Stream& operator =(const Stream&);
+    //! queries an asynchronous stream for completion status
+    bool queryIfComplete() const;
 
-    bool queryIfComplete();
+    //! waits for stream tasks to complete
     void waitForCompletion();
 
-    //! downloads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
-    void enqueueDownload(const GpuMat& src, CudaMem& dst);
-    void enqueueDownload(const GpuMat& src, Mat& dst);
+    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+    void enqueueHostCallback(StreamCallback callback, void* userData);
 
-    //! uploads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
-    void enqueueUpload(const CudaMem& src, GpuMat& dst);
-    void enqueueUpload(const Mat& src, GpuMat& dst);
+    //! returns Stream object for default CUDA stream
+    static Stream& Null();
 
-    //! copy asynchronously
-    void enqueueCopy(const GpuMat& src, GpuMat& dst);
+    //! returns true if stream object is not default (!= 0)
+    operator bool_type() const;
 
-    //! memory set asynchronously
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
+    // obsolete methods
 
-    //! converts matrix type, ex from float to uchar depending on type
-    void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
+    void enqueueDownload(const GpuMat& src, OutputArray dst);
 
-    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
-    typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
-    void enqueueHostCallback(StreamCallback callback, void* userData);
+    void enqueueUpload(InputArray src, GpuMat& dst);
 
-    static Stream& Null();
+    void enqueueCopy(const GpuMat& src, OutputArray dst);
 
-    operator bool() const;
+    void enqueueMemSet(GpuMat& src, Scalar val);
+    void enqueueMemSet(GpuMat& src, Scalar val, InputArray mask);
 
-private:
-    struct Impl;
+    void enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha = 1.0, double beta = 0.0);
 
-    explicit Stream(Impl* impl);
-    void create();
-    void release();
+    class Impl;
 
-    Impl *impl;
+private:
+    Ptr<Impl> impl_;
+    Stream(const Ptr<Impl>& impl);
 
     friend struct StreamAccessor;
 };
 
@@ -498,7 +494,13 @@ CV_EXPORTS void printCudaDeviceInfo(int device);
 CV_EXPORTS void printShortCudaDeviceInfo(int device);
 
-}} // cv::gpu
+}} // namespace cv { namespace gpu {
+
+namespace cv {
+
+template <> CV_EXPORTS void Ptr<cv::gpu::Stream::Impl>::delete_obj();
+
+}
 
 #include "opencv2/core/gpu.inl.hpp"
 
diff --git a/modules/core/include/opencv2/core/gpu.inl.hpp b/modules/core/include/opencv2/core/gpu.inl.hpp
index 10b8ff5947..1983cbcb25 100644
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@@ -46,8 +46,7 @@
 #include "opencv2/core/gpu.hpp"
 
-namespace cv { namespace gpu
-{
+namespace cv { namespace gpu {
 
 //////////////////////////////// GpuMat ///////////////////////////////
 
@@ -524,7 +523,51 @@ void swap(CudaMem& a, CudaMem& b)
     a.swap(b);
 }
 
-}} // namespace cv { namespace gpu
+//////////////////////////////// Stream ///////////////////////////////
+
+inline
+void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)
+{
+    src.download(dst, *this);
+}
+
+inline
+void Stream::enqueueUpload(InputArray src, GpuMat& dst)
+{
+    dst.upload(src, *this);
+}
+
+inline
+void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
+{
+    src.copyTo(dst, *this);
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val)
+{
+    src.setTo(val, *this);
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
+{
+    src.setTo(val, mask, *this);
+}
+
+inline
+void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)
+{
+    src.convertTo(dst, dtype, alpha, beta, *this);
+}
+
+inline
+Stream::Stream(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+}} // namespace cv { namespace gpu {
 
 //////////////////////////////// Mat ////////////////////////////////
 
diff --git a/modules/core/src/gpu_stream.cpp b/modules/core/src/gpu_stream.cpp
index 251e3a2aab..cf90501593 100644
--- a/modules/core/src/gpu_stream.cpp
+++ b/modules/core/src/gpu_stream.cpp
@@ -45,170 +45,103 @@
 using namespace cv;
 using namespace cv::gpu;
 
-#if !defined (HAVE_CUDA)
+#ifndef HAVE_CUDA
 
-cv::gpu::Stream::Stream() { throw_no_cuda(); }
-cv::gpu::Stream::~Stream() {}
-cv::gpu::Stream::Stream(const Stream&) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::operator=(const Stream&) { throw_no_cuda(); return *this; }
-bool cv::gpu::Stream::queryIfComplete() { throw_no_cuda(); return false; }
-void cv::gpu::Stream::waitForCompletion() { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::Null() { throw_no_cuda(); static Stream s; return s; }
-cv::gpu::Stream::operator bool() const { throw_no_cuda(); return false; }
-cv::gpu::Stream::Stream(Impl*) { throw_no_cuda(); }
-void cv::gpu::Stream::create() { throw_no_cuda(); }
-void cv::gpu::Stream::release() { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-struct Stream::Impl
+class cv::gpu::Stream::Impl
 {
-    static cudaStream_t getStream(const Impl* impl)
+public:
+    Impl(void* ptr = 0)
     {
-        return impl ? impl->stream : 0;
+        (void) ptr;
+        throw_no_cuda();
     }
+};
+
+#else
 
+class cv::gpu::Stream::Impl
+{
+public:
     cudaStream_t stream;
-    int ref_counter;
+
+    Impl();
+    Impl(cudaStream_t stream);
+
+    ~Impl();
 };
 
-cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+cv::gpu::Stream::Impl::Impl() : stream(0)
 {
-    return Stream::Impl::getStream(stream.impl);
+    cudaSafeCall( cudaStreamCreate(&stream) );
 }
 
-cv::gpu::Stream::Stream() : impl(0)
+cv::gpu::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
 {
-    create();
 }
 
-cv::gpu::Stream::~Stream()
+cv::gpu::Stream::Impl::~Impl()
 {
-    release();
+    if (stream)
+        cudaStreamDestroy(stream);
 }
 
-cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
+cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
 {
-    if (impl)
-        CV_XADD(&impl->ref_counter, 1);
+    return stream.impl_->stream;
 }
 
-Stream& cv::gpu::Stream::operator =(const Stream& stream)
-{
-    if (this != &stream)
-    {
-        release();
-        impl = stream.impl;
-        if (impl)
-            CV_XADD(&impl->ref_counter, 1);
-    }
+#endif
 
-    return *this;
+cv::gpu::Stream::Stream()
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    impl_ = new Impl;
+#endif
 }
 
-bool cv::gpu::Stream::queryIfComplete()
+bool cv::gpu::Stream::queryIfComplete() const
 {
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaError_t err = cudaStreamQuery(stream);
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    return false;
+#else
+    cudaError_t err = cudaStreamQuery(impl_->stream);
 
     if (err == cudaErrorNotReady || err == cudaSuccess)
         return err == cudaSuccess;
 
     cudaSafeCall(err);
     return false;
+#endif
 }
 
 void cv::gpu::Stream::waitForCompletion()
 {
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaSafeCall( cudaStreamSynchronize(stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
-    // if not -> allocation will be done, but after that dst will not point to page locked memory
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    src.setTo(val, *this);
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
-{
-    src.setTo(val, mask, *this);
-}
-
-void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta)
-{
-    src.convertTo(dst, dtype, alpha, beta, *this);
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
+#endif
 }
 
-#if CUDART_VERSION >= 5000
+#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
 
 namespace
 {
     struct CallbackData
     {
-        cv::gpu::Stream::StreamCallback callback;
+        Stream::StreamCallback callback;
         void* userData;
-        Stream stream;
+
+        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
     };
 
     void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
     {
         CallbackData* data = reinterpret_cast<CallbackData*>(userData);
-        data->callback(data->stream, static_cast<int>(status), data->userData);
+        data->callback(static_cast<int>(status), data->userData);
         delete data;
     }
 }
 
@@ -217,58 +150,39 @@ namespace
 
 void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
 {
-#if CUDART_VERSION >= 5000
-    CallbackData* data = new CallbackData;
-    data->callback = callback;
-    data->userData = userData;
-    data->stream = *this;
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    cudaSafeCall( cudaStreamAddCallback(stream, cudaStreamCallback, data, 0) );
-#else
+#ifndef HAVE_CUDA
     (void) callback;
     (void) userData;
-    CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0");
+    throw_no_cuda();
+#else
+    #if CUDART_VERSION < 5000
+        (void) callback;
+        (void) userData;
+        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
+    #else
+        CallbackData* data = new CallbackData(callback, userData);
+
+        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
+    #endif
 #endif
 }
 
-cv::gpu::Stream& cv::gpu::Stream::Null()
+Stream& cv::gpu::Stream::Null()
 {
-    static Stream s((Impl*) 0);
+    static Stream s(new Impl(0));
     return s;
 }
 
-cv::gpu::Stream::operator bool() const
-{
-    return impl && impl->stream;
-}
-
-cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_)
+cv::gpu::Stream::operator bool_type() const
 {
+#ifndef HAVE_CUDA
+    return 0;
+#else
+    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
+#endif
 }
 
-void cv::gpu::Stream::create()
-{
-    if (impl)
-        release();
-
-    cudaStream_t stream;
-    cudaSafeCall( cudaStreamCreate( &stream ) );
-
-    impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl));
-
-    impl->stream = stream;
-    impl->ref_counter = 1;
-}
-
-void cv::gpu::Stream::release()
+template <> void cv::Ptr<Stream::Impl>::delete_obj()
 {
-    if (impl && CV_XADD(&impl->ref_counter, -1) == 1)
-    {
-        cudaSafeCall( cudaStreamDestroy(impl->stream) );
-        cv::fastFree(impl);
-    }
+    if (obj) delete obj;
 }
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp
index c605b989ea..a6cd1cb62e 100644
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@@ -217,10 +217,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     {
         if (src3.empty())
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
         else
         {
@@ -230,10 +227,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     }
     else
     {
-        if (stream)
-            stream.enqueueCopy(src3, dst);
-        else
-            src3.copyTo(dst);
+        src3.copyTo(dst, stream);
     }
 }
 
@@ -336,18 +330,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
         cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
 
         sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, Scalar::all(0));
-        else
-            sum.setTo(Scalar::all(0));
+
+        sum.setTo(Scalar::all(0), s);
 
         GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
         GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+
+        res.copyTo(inner, s);
     }
     else
     {
@@ -720,10 +709,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 
                 GpuMat result_block(result_roi_size, result_data.type(), result_data.ptr(), result_data.step);
 
-                if (stream)
-                    stream.enqueueCopy(result_block, result_roi);
-                else
-                    result_block.copyTo(result_roi);
+                result_block.copyTo(result_roi, stream);
             }
         }
 
diff --git a/modules/gpubgsegm/src/gmg.cpp b/modules/gpubgsegm/src/gmg.cpp
index f29bf45519..a38cbffaca 100644
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -134,10 +134,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
         initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);
 
     fgmask.create(frameSize_, CV_8UC1);
-    if (stream)
-        stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
-    else
-        fgmask.setTo(cv::Scalar::all(0));
+    fgmask.setTo(cv::Scalar::all(0), stream);
 
     funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));
 
diff --git a/modules/gpufeatures2d/src/brute_force_matcher.cpp b/modules/gpufeatures2d/src/brute_force_matcher.cpp
index e350d48cfa..feb0cc6928 100644
--- a/modules/gpufeatures2d/src/brute_force_matcher.cpp
+++ b/modules/gpufeatures2d/src/brute_force_matcher.cpp
@@ -497,10 +497,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
         ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
     }
 
-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -616,10 +613,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
     ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
     ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
 
-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -803,10 +797,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
         ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
     }
 
-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -931,10 +922,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
         ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
     }
 
-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 8232ab804c..26442f546f 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -157,10 +157,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@@ -221,10 +218,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         ensureSizeIsEnough(src_size, bufType, *pbuf);
@@ -487,10 +481,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@@ -557,10 +548,7 @@ namespace
 
         if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
         {
-            if (stream)
-                stream.enqueueCopy(src, dst);
-            else
-                src.copyTo(dst);
+            src.copyTo(dst, stream);
             return;
         }
 
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
index 17d7b7686e..008d3da1ce 100644
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -196,16 +196,9 @@ namespace
             return;
         }
 
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);
+
         matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
     }
 
@@ -317,16 +310,8 @@ namespace
     void matchTemplate_CCOFF_NORMED_8U(
             const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
     {
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);
 
         matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
 
diff --git a/modules/gpuoptflow/src/farneback.cpp b/modules/gpuoptflow/src/farneback.cpp
index 60a9cda67c..9ed6403eea 100644
--- a/modules/gpuoptflow/src/farneback.cpp
+++ b/modules/gpuoptflow/src/farneback.cpp
@@ -235,8 +235,8 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
         break;
     }
 
-    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
-    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);
+    frame0.convertTo(frames_[0], CV_32F, streams[0]);
+    frame1.convertTo(frames_[1], CV_32F, streams[1]);
 
     if (fastPyramids)
     {
@@ -293,21 +293,21 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
             {
                 gpu::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                 gpu::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
-                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
             }
             else
             {
-                streams[0].enqueueMemSet(curFlowX, 0);
-                streams[1].enqueueMemSet(curFlowY, 0);
+                curFlowX.setTo(0, streams[0]);
+                curFlowY.setTo(0, streams[1]);
             }
         }
         else
         {
            gpu::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            gpu::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
-            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
+            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
+            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
         }
 
         GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
@@ -343,7 +343,7 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
         {
             cudev::optflow_farneback::gaussianBlurGpu(
                 frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-            gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR, streams[i]);
+            gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
             cudev::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
         }
     }
 
diff --git a/modules/gpustereo/src/disparity_bilateral_filter.cpp b/modules/gpustereo/src/disparity_bilateral_filter.cpp
index ef5be018da..d13fcc004f 100644
--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
@@ -113,10 +113,7 @@ namespace
 
         if (&dst != &disp)
         {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
+            disp.copyTo(dst, stream);
         }
 
         disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
 
diff --git a/modules/gpustereo/src/stereobp.cpp b/modules/gpustereo/src/stereobp.cpp
index 957eb70144..5ce56c1d70 100644
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
@@ -194,20 +194,10 @@ namespace
 
         if (rthis.levels & 1)
         {
             //can clear less area
-            if (stream)
-            {
-                stream.enqueueMemSet(u, zero);
-                stream.enqueueMemSet(d, zero);
-                stream.enqueueMemSet(l, zero);
-                stream.enqueueMemSet(r, zero);
-            }
-            else
-            {
-                u.setTo(zero);
-                d.setTo(zero);
-                l.setTo(zero);
-                r.setTo(zero);
-            }
+            u.setTo(zero, stream);
+            d.setTo(zero, stream);
+            l.setTo(zero, stream);
+            r.setTo(zero, stream);
         }
 
         if (rthis.levels > 1)
@@ -222,20 +212,10 @@ namespace
 
             if ((rthis.levels & 1) == 0)
             {
-                if (stream)
-                {
-                    stream.enqueueMemSet(u2, zero);
-                    stream.enqueueMemSet(d2, zero);
-                    stream.enqueueMemSet(l2, zero);
-                    stream.enqueueMemSet(r2, zero);
-                }
-                else
-                {
-                    u2.setTo(zero);
-                    d2.setTo(zero);
-                    l2.setTo(zero);
-                    r2.setTo(zero);
-                }
+                u2.setTo(zero, stream);
+                d2.setTo(zero, stream);
+                l2.setTo(zero, stream);
+                r2.setTo(zero, stream);
             }
         }
 
@@ -313,20 +293,12 @@ namespace
 
         out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
 
-        if (stream)
-            stream.enqueueMemSet(out, zero);
-        else
-            out.setTo(zero);
+        out.setTo(zero, stream);
 
         output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);
 
         if (disp.type() != CV_16S)
-        {
-            if (stream)
-                stream.enqueueConvert(out, disp, disp.type());
-            else
-                out.convertTo(disp, disp.type());
-        }
+            out.convertTo(disp, disp.type(), stream);
     }
 
     StereoBeliefPropagation& rthis;
 
diff --git a/modules/gpustereo/src/stereocsbp.cpp b/modules/gpustereo/src/stereocsbp.cpp
index bd5ef4be9f..cedba1eeb9 100644
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
@@ -213,36 +213,18 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te
     load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
 
-    if (stream)
-    {
-        stream.enqueueMemSet(l[0], zero);
-        stream.enqueueMemSet(d[0], zero);
-        stream.enqueueMemSet(r[0], zero);
-        stream.enqueueMemSet(u[0], zero);
-
-        stream.enqueueMemSet(l[1], zero);
-        stream.enqueueMemSet(d[1], zero);
-        stream.enqueueMemSet(r[1], zero);
-        stream.enqueueMemSet(u[1], zero);
-
-        stream.enqueueMemSet(data_cost, zero);
-        stream.enqueueMemSet(data_cost_selected, zero);
-    }
-    else
-    {
-        l[0].setTo(zero);
-        d[0].setTo(zero);
-        r[0].setTo(zero);
-        u[0].setTo(zero);
-
-        l[1].setTo(zero);
-        d[1].setTo(zero);
-        r[1].setTo(zero);
-        u[1].setTo(zero);
-
-        data_cost.setTo(zero);
-        data_cost_selected.setTo(zero);
-    }
+    l[0].setTo(zero, stream);
+    d[0].setTo(zero, stream);
+    r[0].setTo(zero, stream);
+    u[0].setTo(zero, stream);
+
+    l[1].setTo(zero, stream);
+    d[1].setTo(zero, stream);
+    r[1].setTo(zero, stream);
+    u[1].setTo(zero, stream);
+
+    data_cost.setTo(zero, stream);
+    data_cost_selected.setTo(zero, stream);
 
     int cur_idx = 0;
 
@@ -279,20 +261,14 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te
 
     out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
 
-    if (stream)
-        stream.enqueueMemSet(out, zero);
-    else
-        out.setTo(zero);
+    out.setTo(zero, stream);
 
     compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);
 
     if (disp.type() != CV_16S)
     {
-        if (stream)
-            stream.enqueueConvert(out, disp, disp.type());
-        else
-            out.convertTo(disp, disp.type());
+        out.convertTo(disp, disp.type(), stream);
     }
 }
 
diff --git a/modules/gpuwarping/src/pyramids.cpp b/modules/gpuwarping/src/pyramids.cpp
index db9dd611a0..19d5dcf94a 100644
--- a/modules/gpuwarping/src/pyramids.cpp
+++ b/modules/gpuwarping/src/pyramids.cpp
@@ -184,10 +184,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
 
     if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
     {
-        if (stream)
-            stream.enqueueCopy(layer0_, outImg);
-        else
-            layer0_.copyTo(outImg);
+        layer0_.copyTo(outImg, stream);
     }
 
     float lastScale = 1.0f;
@@ -202,10 +199,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
 
         if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
         {
-            if (stream)
-                stream.enqueueCopy(curLayer, outImg);
-            else
-                curLayer.copyTo(outImg);
+            curLayer.copyTo(outImg, stream);
         }
 
         if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
 
diff --git a/modules/gpuwarping/src/resize.cpp b/modules/gpuwarping/src/resize.cpp
index 68708b41a0..5cb5184833 100644
--- a/modules/gpuwarping/src/resize.cpp
+++ b/modules/gpuwarping/src/resize.cpp
@@ -77,10 +77,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
 
     if (dsize == src.size())
    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
+        src.copyTo(dst, s);
         return;
     }
 
diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp
index 3e7795df80..6119620e27 100644
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@@ -335,10 +335,7 @@ struct cv::softcascade::SCascade::Fields
 
     void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
     {
-        if (s)
-            s.enqueueMemSet(objects, 0);
-        else
-            cudaMemset(objects.data, 0, sizeof(Detection));
+        objects.setTo(Scalar::all(0), s);
 
         cudaSafeCall( cudaGetLastError());
 
@@ -354,16 +351,8 @@ struct cv::softcascade::SCascade::Fields
         cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
         ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
 
-        if (s)
-        {
-            s.enqueueMemSet(overlaps, 0);
-            s.enqueueMemSet(suppressed, 0);
-        }
-        else
-        {
-            overlaps.setTo(0);
-            suppressed.setTo(0);
-        }
+        overlaps.setTo(0, s);
+        suppressed.setTo(0, s);
 
         cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
         cudev::suppress(objects, overlaps, ndetections, suppressed, stream);
@@ -488,18 +477,12 @@ void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat&
         cv::softcascade::cudev::shfl_integral(src, buffer, stream);
 
         sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, cv::Scalar::all(0));
-        else
-            sum.setTo(cv::Scalar::all(0));
+        sum.setTo(cv::Scalar::all(0), s);
 
         cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
         cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));
-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+        res.copyTo(inner, s);
     }
     else
         {CV_Error(cv::Error::GpuNotSupported, ": CC 3.x required.");}
 }
 
@@ -541,10 +524,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
     }
     else
     {
-        if (s)
-            s.enqueueCopy(image, flds.hogluv);
-        else
-            image.copyTo(flds.hogluv);
+        image.copyTo(flds.hogluv, s);
     }
 
     flds.detect(objects, s);
 
@@ -571,10 +551,7 @@ using cv::gpu::GpuMat;
 
 inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
 {
-    if (s)
-        s.enqueueMemSet(m, 0);
-    else
-        m.setTo(0);
+    m.setTo(0, s);
 }
 
 struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
 
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 3a0f99e499..87b5255990 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -368,8 +368,8 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
 
     gpu::Stream stream;
 
-    stream.enqueueConvert(b.gI1, b.t1, CV_32F);
-    stream.enqueueConvert(b.gI2, b.t2, CV_32F);
+    b.gI1.convertTo(b.t1, CV_32F, stream);
+    b.gI2.convertTo(b.t2, CV_32F, stream);
 
     gpu::split(b.t1, b.vI1, stream);
     gpu::split(b.t2, b.vI2, stream);
@@ -379,16 +379,16 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
-        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, stream);        // I2^2
-        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, stream);        // I1^2
-        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, stream);       // I1 * I2
+        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, 1, -1, stream);        // I2^2
+        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
+        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
         gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
         gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
 
-        gpu::multiply(b.mu1, b.mu1, b.mu1_2, stream);
-        gpu::multiply(b.mu2, b.mu2, b.mu2_2, stream);
-        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, stream);
+        gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
+        gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
+        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
         gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
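
Taken together, the hunks above apply one mechanical migration: the old two-path idiom "if (stream) stream.enqueueX(...); else x.y(...);" becomes a single stream-aware GpuMat call, with Stream::Null() (the default CUDA stream) covering the synchronous case, and host callbacks lose their Stream& parameter. Below is a minimal usage sketch against this revision of the API; the function and variable names are illustrative, not part of the patch.

// Illustrative sketch only -- exercises the refactored cv::gpu::Stream API shown above.
#include "opencv2/core/gpu.hpp"

using namespace cv;

// New callback signature: (status, userData); the old Stream& parameter is gone.
static void onStreamDone(int /*status*/, void* /*userData*/)
{
    // Runs on a host thread after everything queued before the callback completes.
}

void convertAndZero(const gpu::GpuMat& src, gpu::GpuMat& dst, gpu::Stream& stream)
{
    // Old: if (stream) stream.enqueueConvert(src, dst, CV_32F);
    //      else        src.convertTo(dst, CV_32F);
    // New: one call; passing Stream::Null() reproduces the synchronous behavior.
    src.convertTo(dst, CV_32F, stream);

    dst.setTo(Scalar::all(0), stream);            // was stream.enqueueMemSet(dst, 0)

    stream.enqueueHostCallback(onStreamDone, 0);  // requires CUDA 5.0+
    stream.waitForCompletion();
}

The safe-bool conversion (operator bool_type) keeps existing "if (stream)" checks compiling while making accidental comparisons between Stream objects a compile error.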