Merge pull request #23371 from cudawarped:cuda_add_futher_python_interop

`cuda`: Add bindings to allow `GpuMat` and `Stream` objects to be initialized from memory initialized in other libraries
pull/23672/head
Alexander Smorkalov 2 years ago committed by GitHub
commit e3c5c0906b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 30
      modules/core/include/opencv2/core/cuda.hpp
  2. 9
      modules/core/src/cuda_stream.cpp
  3. 6
      modules/python/test/test_cuda.py

@ -567,6 +567,29 @@ The function does not reallocate memory if the matrix has proper attributes alre
*/
CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
/** @brief Bindings overload to create a GpuMat from existing GPU memory.
@param rows Row count.
@param cols Column count.
@param type Type of the matrix.
@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it.
@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize.
@note Overload for generation of bindings only, not exported or intended for use internally from C++.
*/
CV_EXPORTS_W inline GpuMat createGpuMatFromCudaMemory(int rows, int cols, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
    // Header-only wrap of a caller-owned device pointer: no allocation, no copy.
    return GpuMat(rows, cols, type, reinterpret_cast<void*>(cudaMemoryAddress), step);
}
/** @overload
@param size 2D array size: Size(cols, rows). In the Size() constructor, the number of rows and the number of columns go in the reverse order.
@param type Type of the matrix.
@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it.
@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize.
@note Overload for generation of bindings only, not exported or intended for use internally from C++.
*/
CV_EXPORTS_W inline GpuMat createGpuMatFromCudaMemory(Size size, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
    // Convert the integer handle back to a device pointer; ownership stays with the caller.
    void* devicePtr = reinterpret_cast<void*>(cudaMemoryAddress);
    return GpuMat(size, type, devicePtr, step);
}
/** @brief BufferPool for use with CUDA streams
BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is
@ -921,6 +944,13 @@ private:
friend class DefaultDeviceInitializer;
};
/** @brief Bindings overload to create a Stream object from the address stored in an existing CUDA Runtime API stream pointer (cudaStream_t).
@param cudaStreamMemoryAddress Memory address stored in a CUDA Runtime API stream pointer (cudaStream_t). The created Stream object does not perform any allocation or deallocation and simply wraps existing raw CUDA Runtime API stream pointer.
@return Stream object wrapping the supplied raw stream handle; the underlying cudaStream_t is not owned and is not destroyed by the returned object.
@note Overload for generation of bindings only, not exported or intended for use internally from C++.
*/
CV_EXPORTS_W Stream wrapStream(size_t cudaStreamMemoryAddress);
class CV_EXPORTS_W Event
{
public:

@ -586,6 +586,15 @@ Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
#endif
// Bindings helper: reinterpret an integer handle as a cudaStream_t and wrap it
// in a cv::cuda::Stream without taking ownership of the underlying stream.
Stream cv::cuda::wrapStream(size_t cudaStreamMemoryAddress) {
#ifdef HAVE_CUDA
    return cv::cuda::StreamAccessor::wrapStream(reinterpret_cast<cudaStream_t>(cudaStreamMemoryAddress));
#else
    // Built without CUDA support: the argument is unused and the call always throws.
    CV_UNUSED(cudaStreamMemoryAddress);
    throw_no_cuda();
#endif
}
/////////////////////////////////////////////////////////////
/// StackAllocator

@ -40,8 +40,14 @@ class cuda_test(NewOpenCVTests):
cuMat = cv.cuda_GpuMat()
cuMat.upload(npMat)
self.assertTrue(cuMat.cudaPtr() != 0)
cuMatFromPtrSz = cv.cuda.createGpuMatFromCudaMemory(cuMat.size(),cuMat.type(),cuMat.cudaPtr(), cuMat.step)
self.assertTrue(cuMat.cudaPtr() == cuMatFromPtrSz.cudaPtr())
cuMatFromPtrRc = cv.cuda.createGpuMatFromCudaMemory(cuMat.size()[1],cuMat.size()[0],cuMat.type(),cuMat.cudaPtr(), cuMat.step)
self.assertTrue(cuMat.cudaPtr() == cuMatFromPtrRc.cudaPtr())
stream = cv.cuda_Stream()
self.assertTrue(stream.cudaPtr() != 0)
streamFromPtr = cv.cuda.wrapStream(stream.cudaPtr())
self.assertTrue(stream.cudaPtr() == streamFromPtr.cudaPtr())
asyncstream = cv.cuda_Stream(1) # cudaStreamNonBlocking
self.assertTrue(asyncstream.cudaPtr() != 0)

Loading…
Cancel
Save