From ea94b43541902c2914a166cc697ec218063d41a5 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Thu, 3 Feb 2011 12:02:39 +0000
Subject: [PATCH] added stereo_multi_gpu sample, cosmetic changes in multi_gpu
 sample

---
 modules/gpu/src/cuda/stereobm.cu |   2 +-
 samples/gpu/multi.cpp            |  54 +++++----
 samples/gpu/stereo_multi.cpp     | 183 +++++++++++++++++++++++++++++++
 3 files changed, 217 insertions(+), 22 deletions(-)
 create mode 100644 samples/gpu/stereo_multi.cpp

diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu
index ca6a6ab3a4..732c1622bc 100644
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -357,7 +357,7 @@ extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const
     cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
     cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
 
-    cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
+    cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
     cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
     cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
 
diff --git a/samples/gpu/multi.cpp b/samples/gpu/multi.cpp
index 9063c97c68..381ef011e1 100644
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@@ -1,3 +1,6 @@
+/* This sample demonstrates the way you can perform independent tasks
+   on different GPUs */
+
 // Disable some warnings which are caused with CUDA headers
 #pragma warning(disable: 4201 4408 4100)
 
@@ -34,41 +37,50 @@ using namespace cv::gpu;
 struct Worker { void operator()(int device_id) const; };
 void destroyContexts();
 
-#define cuSafeCall(code) if (code != CUDA_SUCCESS) { \
+#define safeCall(code) if (code != CUDA_SUCCESS) { \
     cout << "CUDA driver API error: code " << code \
         << ", file " << __FILE__ << ", line " << __LINE__ << endl; \
     destroyContexts(); \
     exit(-1); \
 }
 
-
 // Each GPU is associated with its own context
 CUcontext contexts[2];
 
-
 int main()
 {
-    if (getCudaEnabledDeviceCount() < 2)
+    int num_devices = getCudaEnabledDeviceCount();
+
+    if (num_devices < 2)
     {
         cout << "Two or more GPUs are required\n";
         return -1;
     }
 
-    cuSafeCall(cuInit(0));
+    for (int i = 0; i < num_devices; ++i)
+    {
+        if (!DeviceInfo(i).isCompatible())
+        {
+            cout << "GPU module isn't built for GPU #" << i << " (" << DeviceInfo(i).name() << ")";
+            return -1;
+        }
+    }
+
+    safeCall(cuInit(0));
 
-    // Create context for the first GPU
+    // Create context for GPU #0
     CUdevice device;
-    cuSafeCall(cuDeviceGet(&device, 0));
-    cuSafeCall(cuCtxCreate(&contexts[0], 0, device));
+    safeCall(cuDeviceGet(&device, 0));
+    safeCall(cuCtxCreate(&contexts[0], 0, device));
 
     CUcontext prev_context;
-    cuSafeCall(cuCtxPopCurrent(&prev_context));
+    safeCall(cuCtxPopCurrent(&prev_context));
 
-    // Create context for the second GPU
-    cuSafeCall(cuDeviceGet(&device, 1));
-    cuSafeCall(cuCtxCreate(&contexts[1], 0, device));
+    // Create context for GPU #1
+    safeCall(cuDeviceGet(&device, 1));
+    safeCall(cuCtxCreate(&contexts[1], 0, device));
 
-    cuSafeCall(cuCtxPopCurrent(&prev_context));
+    safeCall(cuCtxPopCurrent(&prev_context));
 
     // Execute calculation in two threads using two GPUs
     int devices[] = {0, 1};
@@ -81,8 +93,8 @@ int main()
 
 void Worker::operator()(int device_id) const
 {
-    // Set proper context
-    cuSafeCall(cuCtxPushCurrent(contexts[device_id]));
+    // Set the proper context
+    safeCall(cuCtxPushCurrent(contexts[device_id]));
 
     Mat src(1000, 1000, CV_32F);
     Mat dst;
@@ -93,15 +105,15 @@ void Worker::operator()(int device_id) const
     // CPU works
     transpose(src, dst);
 
+    // GPU works
     GpuMat d_src(src);
     GpuMat d_dst;
-
-    // GPU works
     transpose(d_src, d_dst);
 
     // Check results
     bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
-    cout << "GPU #" << device_id << ": "<< (passed ? "passed" : "FAILED") << endl;
+    cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
+        << (passed ? "passed" : "FAILED") << endl;
 
     // Deallocate data here, otherwise deallocation will be performed
     // after context is extracted from the stack
@@ -109,14 +121,14 @@ void Worker::operator()(int device_id) const
     d_dst.release();
 
     CUcontext prev_context;
-    cuSafeCall(cuCtxPopCurrent(&prev_context));
+    safeCall(cuCtxPopCurrent(&prev_context));
 }
 
 
 void destroyContexts()
 {
-    cuSafeCall(cuCtxDestroy(contexts[0]));
-    cuSafeCall(cuCtxDestroy(contexts[1]));
+    safeCall(cuCtxDestroy(contexts[0]));
+    safeCall(cuCtxDestroy(contexts[1]));
 }
 
 #endif
\ No newline at end of file
diff --git a/samples/gpu/stereo_multi.cpp b/samples/gpu/stereo_multi.cpp
new file mode 100644
index 0000000000..975690c3c0
--- /dev/null
+++ b/samples/gpu/stereo_multi.cpp
@@ -0,0 +1,183 @@
+/* This sample demonstrates working on one piece of data using two GPUs.
+   It splits the input into two parts and processes them separately on
+   different GPUs. */
+
+// Disable some warnings which are caused with CUDA headers
+#pragma warning(disable: 4201 4408 4100)
+
+#include <iostream>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+
+int main()
+{
+#if !defined(HAVE_CUDA)
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
+#endif
+
+#if !defined(HAVE_TBB)
+    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+#endif
+
+    return 0;
+}
+
+#else
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "opencv2/core/internal.hpp" // For TBB wrappers
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+struct Worker { void operator()(int device_id) const; };
+void destroyContexts();
+
+#define safeCall(code) if (code != CUDA_SUCCESS) { \
+    cout << "CUDA driver API error: code " << code \
+        << ", file " << __FILE__ << ", line " << __LINE__ << endl; \
+    destroyContexts(); \
+    exit(-1); \
+}
+
+// Each GPU is associated with its own context
+CUcontext contexts[2];
+
+void inline contextOn(int id)
+{
+    safeCall(cuCtxPushCurrent(contexts[id]));
+}
+
+void inline contextOff()
+{
+    CUcontext prev_context;
+    safeCall(cuCtxPopCurrent(&prev_context));
+}
+
+GpuMat d_left[2];
+GpuMat d_right[2];
+StereoBM_GPU* bm[2];
+GpuMat d_result[2];
+Mat result;
+
+int main(int argc, char** argv)
+{
+    if (argc < 3)
+    {
+        cout << "Usage: stereo_multi_gpu <left_image> <right_image>\n";
+        return -1;
+    }
+
+    int num_devices = getCudaEnabledDeviceCount();
+
+    if (num_devices < 2)
+    {
+        cout << "Two or more GPUs are required\n";
+        return -1;
+    }
+
+    for (int i = 0; i < num_devices; ++i)
+    {
+        if (!DeviceInfo(i).isCompatible())
+        {
+            cout << "GPU module isn't built for GPU #" << i << " (" << DeviceInfo(i).name() << ")";
+            return -1;
+        }
+    }
+
+    // Load input data
+    Mat left = imread(argv[1], CV_LOAD_IMAGE_GRAYSCALE);
+    Mat right = imread(argv[2], CV_LOAD_IMAGE_GRAYSCALE);
+    if (left.empty())
+    {
+        cout << "Cannot open '" << argv[1] << "'\n";
+        return -1;
+    }
+    if (right.empty())
+    {
+        cout << "Cannot open '" << argv[2] << "'\n";
+        return -1;
+    }
+
+    safeCall(cuInit(0));
+
+    // Create context for the first GPU
+    CUdevice device;
+    safeCall(cuDeviceGet(&device, 0));
+    safeCall(cuCtxCreate(&contexts[0], 0, device));
+    contextOff();
+
+    // Create context for the second GPU
+    safeCall(cuDeviceGet(&device, 1));
+    safeCall(cuCtxCreate(&contexts[1], 0, device));
+    contextOff();
+
+    // Split source images for processing on the first GPU
+    contextOn(0);
+    d_left[0].upload(left.rowRange(0, left.rows / 2));
+    d_right[0].upload(right.rowRange(0, right.rows / 2));
+    bm[0] = new StereoBM_GPU();
+    contextOff();
+
+    // Split source images for processing on the second GPU
+    contextOn(1);
+    d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
+    d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
+    bm[1] = new StereoBM_GPU();
+    contextOff();
+
+    // Execute calculation in two threads using two GPUs
+    int devices[] = {0, 1};
+    parallel_do(devices, devices + 2, Worker());
+
+    // Release the first GPU resources
+    contextOn(0);
+    imshow("GPU #0 result", Mat(d_result[0]));
+    d_left[0].release();
+    d_right[0].release();
+    d_result[0].release();
+    delete bm[0];
+    contextOff();
+
+    // Release the second GPU resources
+    contextOn(1);
+    imshow("GPU #1 result", Mat(d_result[1]));
+    d_left[1].release();
+    d_right[1].release();
+    d_result[1].release();
+    delete bm[1];
+    contextOff();
+
+    waitKey();
+    destroyContexts();
+    return 0;
+}
+
+
+void Worker::operator()(int device_id) const
+{
+    contextOn(device_id);
+
+    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
+                              d_result[device_id]);
+
+    cout << "GPU #" << device_id << " (" << DeviceInfo().name()
+         << "): finished\n";
+
+    contextOff();
+}
+
+
+void destroyContexts()
+{
+    safeCall(cuCtxDestroy(contexts[0]));
+    safeCall(cuCtxDestroy(contexts[1]));
+}
+
+#endif
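
Note on the context handling above (not part of the patch): both samples pair every
cuCtxPushCurrent() with a cuCtxPopCurrent() on the same thread, and release GpuMat
data before the context is popped. The sketch below shows the same push/pop
discipline wrapped in a small RAII guard so the pop cannot be missed on an early
return; the ContextGuard name and the assert-based error handling are illustrative
assumptions rather than OpenCV or CUDA API.

#include <cuda.h>
#include <cassert>

// Hypothetical helper: makes the given driver-API context current for the
// lifetime of the enclosing scope, then restores the previous one.
class ContextGuard
{
public:
    explicit ContextGuard(CUcontext ctx)
    {
        // Push the context onto this CPU thread's context stack
        CUresult err = cuCtxPushCurrent(ctx);
        assert(err == CUDA_SUCCESS);
        (void)err;
    }

    ~ContextGuard()
    {
        // Pop it again when the guard goes out of scope
        CUcontext prev = 0;
        CUresult err = cuCtxPopCurrent(&prev);
        assert(err == CUDA_SUCCESS);
        (void)err; (void)prev;
    }

private:
    ContextGuard(const ContextGuard&);            // non-copyable
    ContextGuard& operator=(const ContextGuard&);
};

// Usage inside a worker thread (GPU data must still be released before the
// guard is destroyed, exactly as the samples do explicitly):
//
//     ContextGuard guard(contexts[device_id]);
//     ... GPU work for this device ...

With such a guard, Worker::operator() could create a ContextGuard at the top of its
body instead of calling contextOn()/contextOff() by hand; the samples keep the
explicit calls, which makes the push/pop points easier to see in a tutorial.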