diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index f3da470899..f05d51dc9a 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -344,6 +344,8 @@ class CV_EXPORTS HostMem
 public:
     enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
 
+    static MatAllocator* getAllocator(AllocType alloc_type = PAGE_LOCKED);
+
     explicit HostMem(AllocType alloc_type = PAGE_LOCKED);
 
     HostMem(const HostMem& m);
diff --git a/modules/core/src/cuda_host_mem.cpp b/modules/core/src/cuda_host_mem.cpp
index dafa4f1621..2ad733b675 100644
--- a/modules/core/src/cuda_host_mem.cpp
+++ b/modules/core/src/cuda_host_mem.cpp
@@ -42,10 +42,124 @@
 //M*/
 
 #include "precomp.hpp"
+#include <map>
 
 using namespace cv;
 using namespace cv::cuda;
 
+#ifdef HAVE_CUDA
+
+namespace {
+
+class HostMemAllocator : public MatAllocator
+{
+public:
+    explicit HostMemAllocator(unsigned int flags) : flags_(flags)
+    {
+    }
+
+    UMatData* allocate(int dims, const int* sizes, int type,
+                       void* data0, size_t* step,
+                       int /*flags*/, UMatUsageFlags /*usageFlags*/) const
+    {
+        size_t total = CV_ELEM_SIZE(type);
+        for (int i = dims-1; i >= 0; i--)
+        {
+            if (step)
+            {
+                if (data0 && step[i] != CV_AUTOSTEP)
+                {
+                    CV_Assert(total <= step[i]);
+                    total = step[i];
+                }
+                else
+                {
+                    step[i] = total;
+                }
+            }
+
+            total *= sizes[i];
+        }
+
+        UMatData* u = new UMatData(this);
+        u->size = total;
+
+        if (data0)
+        {
+            u->data = u->origdata = static_cast<uchar*>(data0);
+            u->flags |= UMatData::USER_ALLOCATED;
+        }
+        else
+        {
+            void* ptr = 0;
+            cudaSafeCall( cudaHostAlloc(&ptr, total, flags_) );
+
+            u->data = u->origdata = static_cast<uchar*>(ptr);
+        }
+
+        return u;
+    }
+
+    bool allocate(UMatData* u, int /*accessFlags*/, UMatUsageFlags /*usageFlags*/) const
+    {
+        return (u != NULL);
+    }
+
+    void deallocate(UMatData* u) const
+    {
+        CV_Assert(u->urefcount >= 0);
+        CV_Assert(u->refcount >= 0);
+
+        if (u && u->refcount == 0)
+        {
+            if ( !(u->flags & UMatData::USER_ALLOCATED) )
+            {
+                cudaFreeHost(u->origdata);
+                u->origdata = 0;
+            }
+
+            delete u;
+        }
+    }
+
+private:
+    unsigned int flags_;
+};
+
+} // namespace
+
+#endif
+
+MatAllocator* cv::cuda::HostMem::getAllocator(AllocType alloc_type)
+{
+#ifndef HAVE_CUDA
+    (void) alloc_type;
+    throw_no_cuda();
+    return NULL;
+#else
+    static std::map<unsigned int, Ptr<MatAllocator> > allocators;
+
+    unsigned int flag = cudaHostAllocDefault;
+
+    switch (alloc_type)
+    {
+    case PAGE_LOCKED:    flag = cudaHostAllocDefault; break;
+    case SHARED:         flag = cudaHostAllocMapped; break;
+    case WRITE_COMBINED: flag = cudaHostAllocWriteCombined; break;
+    default:             CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
+    }
+
+    Ptr<MatAllocator>& a = allocators[flag];
+
+    if (a.empty())
+    {
+        a = makePtr<HostMemAllocator>(flag);
+    }
+
+    return a.get();
+#endif
+}
+
 #ifdef HAVE_CUDA
 
 namespace {
diff --git a/modules/core/test/cuda/test_stream.cpp b/modules/core/test/cuda/test_stream.cpp
index bf3316eba3..a0e451a62a 100644
--- a/modules/core/test/cuda/test_stream.cpp
+++ b/modules/core/test/cuda/test_stream.cpp
@@ -129,6 +129,25 @@ CUDA_TEST_P(Async, Convert)
     stream.waitForCompletion();
 }
 
+CUDA_TEST_P(Async, HostMemAllocator)
+{
+    cv::cuda::Stream stream;
+
+    cv::Mat h_dst;
+    h_dst.allocator = cv::cuda::HostMem::getAllocator();
+
+    d_src.upload(src, stream);
+    d_src.convertTo(d_dst, CV_32S, stream);
+    d_dst.download(h_dst, stream);
+
+    stream.waitForCompletion();
+
+    cv::Mat dst_gold;
+    src.createMatHeader().convertTo(dst_gold, CV_32S);
+
+    ASSERT_MAT_NEAR(dst_gold, h_dst, 0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Stream, Async, ALL_DEVICES);
 
 #endif // HAVE_CUDA
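
Usage note (not part of the patch): a minimal sketch of how the new cv::cuda::HostMem::getAllocator() is intended to be used, assuming a CUDA-enabled OpenCV build; the image size, type, and variable names below are illustrative only.

// Route a cv::Mat's host storage through pinned (page-locked) memory so that
// upload()/download() on a cv::cuda::Stream can overlap with other work.
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>

int main()
{
    cv::cuda::Stream stream;

    // Host destination backed by cudaHostAlloc'd pinned memory.
    cv::Mat h_result;
    h_result.allocator = cv::cuda::HostMem::getAllocator(cv::cuda::HostMem::PAGE_LOCKED);

    cv::cuda::GpuMat d_img(480, 640, CV_8UC1);
    d_img.setTo(cv::Scalar::all(0), stream);

    // Asynchronous download into the pinned host buffer.
    d_img.download(h_result, stream);

    stream.waitForCompletion();
    return 0;
}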