diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp index 6dca272366..ca6eb2cf86 100644 --- a/modules/core/include/opencv2/core/cuda.hpp +++ b/modules/core/include/opencv2/core/cuda.hpp @@ -329,11 +329,120 @@ CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr /** @brief BufferPool for use with CUDA streams - * BufferPool utilizes cuda::Stream's allocator to create new buffers. It is - * particularly useful when BufferPoolUsage is set to true, or a custom - * allocator is specified for the cuda::Stream, and you want to implement your - * own stream based functions utilizing the same underlying GPU memory - * management. +BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is +only useful when enabled with #setBufferPoolUsage. + +@code + setBufferPoolUsage(true); +@endcode + +@note #setBufferPoolUsage must be called \em before any Stream declaration. + +Users may specify custom allocator for Stream and may implement their own stream based +functions utilizing the same underlying GPU memory management. + +If custom allocator is not specified, BufferPool utilizes StackAllocator by +default. StackAllocator allocates a chunk of GPU device memory beforehand, +and when GpuMat is declared later on, it is given the pre-allocated memory. +This kind of strategy reduces the number of calls for memory allocating APIs +such as cudaMalloc or cudaMallocPitch. + +Below is an example that utilizes BufferPool with StackAllocator: + +@code + #include + + using namespace cv; + using namespace cv::cuda + + int main() + { + setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool + setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2); // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks) + + Stream stream1, stream2; // Each stream uses 1 stack + BufferPool pool1(stream1), pool2(stream2); + + GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1); // 16MB + GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3); // 48MB, pool1 is now full + + GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1); // 1MB + GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3); // 3MB + + cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1); + cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2); + } +@endcode + +If we allocate another GpuMat on pool1 in the above example, it will be carried out by +the DefaultAllocator since the stack for pool1 is full. + +@code + GpuMat d_add1 = pool1.getBuffer(1024, 1024, CV_8UC1); // Stack for pool1 is full, memory is allocated with DefaultAllocator +@endcode + +If a third stream is declared in the above example, allocating with #getBuffer +within that stream will also be carried out by the DefaultAllocator becuase we've run out of +stacks. + +@code + Stream stream3; // Only 2 stacks were allocated, we've run out of stacks + BufferPool pool3(stream3); + GpuMat d_src3 = pool3.getBuffer(1024, 1024, CV_8UC1); // Memory is allocated with DefaultAllocator +@endcode + +@warning When utilizing StackAllocator, deallocation order is important. + +Just like a stack, deallocation must be done in LIFO order. Below is an example of +erroneous usage that violates LIFO rule. If OpenCV is compiled in Debug mode, this +sample code will emit CV_Assert error. + +@code + int main() + { + setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool + Stream stream; // A default size (10 MB) stack is allocated to this stream + BufferPool pool(stream); + + GpuMat mat1 = pool.getBuffer(1024, 1024, CV_8UC1); // Allocate mat1 (1MB) + GpuMat mat2 = pool.getBuffer(1024, 1024, CV_8UC1); // Allocate mat2 (1MB) + + mat1.release(); // erroneous usage : mat2 must be deallocated before mat1 + } +@endcode + +Since C++ local variables are destroyed in the reverse order of construction, +the code sample below satisfies the LIFO rule. Local GpuMat's are deallocated +and the corresponding memory is automatically returned to the pool for later usage. + +@code + int main() + { + setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool + setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2); // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks) + + Stream stream1, stream2; // Each stream uses 1 stack + BufferPool pool1(stream1), pool2(stream2); + + for (int i = 0; i < 10; i++) + { + GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1); // 16MB + GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3); // 48MB, pool1 is now full + + GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1); // 1MB + GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3); // 3MB + + d_src1.setTo(Scalar(i), stream1); + d_src2.setTo(Scalar(i), stream2); + + cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1); + cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2); + // The order of destruction of the local variables is: + // d_dst2 => d_src2 => d_dst1 => d_src1 + // LIFO rule is satisfied, this code runs without error + } + } +@endcode */ class CV_EXPORTS BufferPool {