diff --git a/samples/gpu/stereo_multi.cpp b/samples/gpu/stereo_multi.cpp index cf42043bbe..231bc072f3 100644 --- a/samples/gpu/stereo_multi.cpp +++ b/samples/gpu/stereo_multi.cpp @@ -1,149 +1,496 @@ -/* This sample demonstrates working on one piece of data using two GPUs. - It splits input into two parts and processes them separately on different - GPUs. */ +// This sample demonstrates working on one piece of data using two GPUs. +// It splits input into two parts and processes them separately on different GPUs. -// Disable some warnings which are caused with CUDA headers -#if defined(_MSC_VER) -#pragma warning(disable: 4201 4408 4100) +#ifdef WIN32 + #define NOMINMAX + #include +#else + #include + #include #endif #include -#include "cvconfig.h" +#include + #include "opencv2/core/core.hpp" #include "opencv2/highgui/highgui.hpp" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/contrib/contrib.hpp" #include "opencv2/gpu/gpu.hpp" -#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) +using namespace std; +using namespace cv; +using namespace cv::gpu; + +/////////////////////////////////////////////////////////// +// Thread +// OS-specific wrappers for multi-threading -int main() +#ifdef WIN32 +class Thread { -#if !defined(HAVE_CUDA) - std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n"; -#endif + struct UserData + { + void (*func)(void* userData); + void* param; + }; + + static DWORD WINAPI WinThreadFunction(LPVOID lpParam) + { + UserData* userData = static_cast(lpParam); + + userData->func(userData->param); + + return 0; + } + + UserData userData_; + HANDLE thread_; + DWORD threadId_; + +public: + Thread(void (*func)(void* userData), void* userData) + { + userData_.func = func; + userData_.param = userData; + + thread_ = CreateThread( + NULL, // default security attributes + 0, // use default stack size + WinThreadFunction, // thread function name + &userData_, // argument to thread function + 0, // use default creation flags + &threadId_); // returns the thread identifier + } + + ~Thread() + { + CloseHandle(thread_); + } + + void wait() + { + WaitForSingleObject(thread_, INFINITE); + } +}; +#else +class Thread +{ + struct UserData + { + void (*func)(void* userData); + void* param; + }; + + static void* PThreadFunction(void* lpParam) + { + UserData* userData = static_cast(lpParam); + + userData->func(userData->param); + + return 0; + } + + pthread_t thread_; + UserData userData_; + +public: + Thread(void (*func)(void* userData), void* userData) + { + userData_.func = func; + userData_.param = userData; + + pthread_create(&thread_, NULL, PThreadFunction, &userData_); + } -#if !defined(HAVE_TBB) - std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n"; + ~Thread() + { + pthread_detach(thread_); + } + + void wait() + { + pthread_join(thread_, NULL); + } +}; #endif - return 0; +/////////////////////////////////////////////////////////// +// StereoSingleGpu +// Run Stereo algorithm on single GPU + +class StereoSingleGpu +{ +public: + explicit StereoSingleGpu(int deviceId = 0); + ~StereoSingleGpu(); + + void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity); + +private: + int deviceId_; + GpuMat d_leftFrame; + GpuMat d_rightFrame; + GpuMat d_disparity; + Ptr d_alg; +}; + +StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId) +{ + gpu::setDevice(deviceId_); + d_alg = new StereoBM_GPU(StereoBM_GPU::BASIC_PRESET, 256); } -#else +StereoSingleGpu::~StereoSingleGpu() +{ + gpu::setDevice(deviceId_); + d_leftFrame.release(); + d_rightFrame.release(); + d_disparity.release(); + d_alg.release(); +} -#include "opencv2/core/internal.hpp" // For TBB wrappers +void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity) +{ + gpu::setDevice(deviceId_); + d_leftFrame.upload(leftFrame); + d_rightFrame.upload(rightFrame); + (*d_alg)(d_leftFrame, d_rightFrame, d_disparity); + d_disparity.download(disparity); +} -using namespace std; -using namespace cv; -using namespace cv::gpu; +/////////////////////////////////////////////////////////// +// StereoMultiGpuThread +// Run Stereo algorithm on two GPUs using different host threads + +class StereoMultiGpuThread +{ +public: + StereoMultiGpuThread(); + ~StereoMultiGpuThread(); + + void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity); + +private: + GpuMat d_leftFrames[2]; + GpuMat d_rightFrames[2]; + GpuMat d_disparities[2]; + Ptr d_algs[2]; + + struct StereoLaunchData + { + int deviceId; + Mat leftFrame; + Mat rightFrame; + Mat disparity; + GpuMat* d_leftFrame; + GpuMat* d_rightFrame; + GpuMat* d_disparity; + Ptr d_alg; + }; + + static void launchGpuStereoAlg(void* userData); +}; + +StereoMultiGpuThread::StereoMultiGpuThread() +{ + gpu::setDevice(0); + d_algs[0] = new StereoBM_GPU(StereoBM_GPU::BASIC_PRESET, 256); + + gpu::setDevice(1); + d_algs[1] = new StereoBM_GPU(StereoBM_GPU::BASIC_PRESET, 256); +} + +StereoMultiGpuThread::~StereoMultiGpuThread() +{ + gpu::setDevice(0); + d_leftFrames[0].release(); + d_rightFrames[0].release(); + d_disparities[0].release(); + d_algs[0].release(); + + gpu::setDevice(1); + d_leftFrames[1].release(); + d_rightFrames[1].release(); + d_disparities[1].release(); + d_algs[1].release(); +} + +void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity) +{ + disparity.create(leftFrame.size(), CV_8UC1); + + // Split input data onto two parts for each GPUs. + // We add small border for each part, + // because original algorithm doesn't calculate disparity on image borders. + // With such padding we will get output in the middle of final result. + + StereoLaunchData launchDatas[2]; + + launchDatas[0].deviceId = 0; + launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32); + launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32); + launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2); + launchDatas[0].d_leftFrame = &d_leftFrames[0]; + launchDatas[0].d_rightFrame = &d_rightFrames[0]; + launchDatas[0].d_disparity = &d_disparities[0]; + launchDatas[0].d_alg = d_algs[0]; + + launchDatas[1].deviceId = 1; + launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows); + launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows); + launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows); + launchDatas[1].d_leftFrame = &d_leftFrames[1]; + launchDatas[1].d_rightFrame = &d_rightFrames[1]; + launchDatas[1].d_disparity = &d_disparities[1]; + launchDatas[1].d_alg = d_algs[1]; -struct Worker { void operator()(int device_id) const; }; + Thread thread0(launchGpuStereoAlg, &launchDatas[0]); + Thread thread1(launchGpuStereoAlg, &launchDatas[1]); -// GPUs data -GpuMat d_left[2]; -GpuMat d_right[2]; -StereoBM_GPU* bm[2]; -GpuMat d_result[2]; + thread0.wait(); + thread1.wait(); +} -static void printHelp() +void StereoMultiGpuThread::launchGpuStereoAlg(void* userData) { - std::cout << "Usage: stereo_multi_gpu --left --right \n"; + StereoLaunchData* data = static_cast(userData); + + gpu::setDevice(data->deviceId); + data->d_leftFrame->upload(data->leftFrame); + data->d_rightFrame->upload(data->rightFrame); + (*data->d_alg)(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity); + + if (data->deviceId == 0) + data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity); + else + data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity); } +/////////////////////////////////////////////////////////// +// StereoMultiGpuStream +// Run Stereo algorithm on two GPUs from single host thread using async API + +class StereoMultiGpuStream +{ +public: + StereoMultiGpuStream(); + ~StereoMultiGpuStream(); + + void compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity); + +private: + GpuMat d_leftFrames[2]; + GpuMat d_rightFrames[2]; + GpuMat d_disparities[2]; + Ptr d_algs[2]; + Ptr streams[2]; +}; + +StereoMultiGpuStream::StereoMultiGpuStream() +{ + gpu::setDevice(0); + d_algs[0] = new StereoBM_GPU(StereoBM_GPU::BASIC_PRESET, 256); + streams[0] = new Stream; + + gpu::setDevice(1); + d_algs[1] = new StereoBM_GPU(StereoBM_GPU::BASIC_PRESET, 256); + streams[1] = new Stream; +} + +StereoMultiGpuStream::~StereoMultiGpuStream() +{ + gpu::setDevice(0); + d_leftFrames[0].release(); + d_rightFrames[0].release(); + d_disparities[0].release(); + d_algs[0].release(); + streams[0].release(); + + gpu::setDevice(1); + d_leftFrames[1].release(); + d_rightFrames[1].release(); + d_disparities[1].release(); + d_algs[1].release(); + streams[1].release(); +} + +void StereoMultiGpuStream::compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity) +{ + disparity.create(leftFrame.size(), CV_8UC1); + + // Split input data onto two parts for each GPUs. + // We add small border for each part, + // because original algorithm doesn't calculate disparity on image borders. + // With such padding we will get output in the middle of final result. + + Mat leftFrameHdr = leftFrame.createMatHeader(); + Mat rightFrameHdr = rightFrame.createMatHeader(); + Mat disparityHdr = disparity.createMatHeader(); + Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2); + Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows); + + gpu::setDevice(0); + streams[0]->enqueueUpload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), d_leftFrames[0]); + streams[0]->enqueueUpload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), d_rightFrames[0]); + (*d_algs[0])(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]); + streams[0]->enqueueDownload(d_disparities[0].rowRange(0, leftFrame.rows / 2), disparityPart0); + + gpu::setDevice(1); + streams[1]->enqueueUpload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), d_leftFrames[1]); + streams[1]->enqueueUpload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), d_rightFrames[1]); + (*d_algs[1])(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]); + streams[1]->enqueueDownload(d_disparities[1].rowRange(32, d_disparities[1].rows), disparityPart1); + + gpu::setDevice(0); + streams[0]->waitForCompletion(); + + gpu::setDevice(1); + streams[1]->waitForCompletion(); +} + +/////////////////////////////////////////////////////////// +// main + int main(int argc, char** argv) { - if (argc < 5) + if (argc != 3) { - printHelp(); + cerr << "Usage: stereo_multi_gpu " << endl; return -1; } - int num_devices = getCudaEnabledDeviceCount(); - if (num_devices < 2) + const int numDevices = getCudaEnabledDeviceCount(); + if (numDevices != 2) { - std::cout << "Two or more GPUs are required\n"; + cerr << "Two GPUs are required" << endl; return -1; } - for (int i = 0; i < num_devices; ++i) - { - cv::gpu::printShortCudaDeviceInfo(i); - DeviceInfo dev_info(i); - if (!dev_info.isCompatible()) + for (int i = 0; i < numDevices; ++i) + { + DeviceInfo devInfo(i); + if (!devInfo.isCompatible()) { - std::cout << "GPU module isn't built for GPU #" << i << " (" - << dev_info.name() << ", CC " << dev_info.majorVersion() - << dev_info.minorVersion() << "\n"; + cerr << "GPU module was't built for GPU #" << i << " (" + << devInfo.name() << ", CC " << devInfo.majorVersion() + << devInfo.minorVersion() << endl; return -1; } + + printShortCudaDeviceInfo(i); } - // Load input data - Mat left, right; - for (int i = 1; i < argc; ++i) + VideoCapture leftVideo(argv[1]); + VideoCapture rightVideo(argv[2]); + + if (!leftVideo.isOpened()) { - if (string(argv[i]) == "--left") - { - left = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE); - CV_Assert(!left.empty()); - } - else if (string(argv[i]) == "--right") - { - right = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE); - CV_Assert(!right.empty()); - } - else if (string(argv[i]) == "--help") + cerr << "Can't open " << argv[1] << " video file" << endl; + return -1; + } + + if (!rightVideo.isOpened()) + { + cerr << "Can't open " << argv[2] << " video file" << endl; + return -1; + } + + cout << endl; + cout << "This sample demonstrates working on one piece of data using two GPUs." << endl; + cout << "It splits input into two parts and processes them separately on different GPUs." << endl; + cout << endl; + + Mat leftFrame, rightFrame; + CudaMem leftGrayFrame, rightGrayFrame; + + StereoSingleGpu gpu0Alg(0); + StereoSingleGpu gpu1Alg(1); + StereoMultiGpuThread multiThreadAlg; + StereoMultiGpuStream multiStreamAlg; + + Mat disparityGpu0; + Mat disparityGpu1; + Mat disparityMultiThread; + CudaMem disparityMultiStream; + + Mat disparityGpu0Show; + Mat disparityGpu1Show; + Mat disparityMultiThreadShow; + Mat disparityMultiStreamShow; + + TickMeter tm; + + cout << "-------------------------------------------------------------------" << endl; + cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl; + cout << "-------------------------------------------------------------------" << endl; + + for (int i = 0;; ++i) + { + leftVideo >> leftFrame; + rightVideo >> rightFrame; + + if (leftFrame.empty() || rightFrame.empty()) + break; + + if (leftFrame.size() != rightFrame.size()) { - printHelp(); + cerr << "Frames have different sizes" << endl; return -1; } - } - // Split source images for processing on the GPU #0 - setDevice(0); - d_left[0].upload(left.rowRange(0, left.rows / 2)); - d_right[0].upload(right.rowRange(0, right.rows / 2)); - bm[0] = new StereoBM_GPU(); - - // Split source images for processing on the GPU #1 - setDevice(1); - d_left[1].upload(left.rowRange(left.rows / 2, left.rows)); - d_right[1].upload(right.rowRange(right.rows / 2, right.rows)); - bm[1] = new StereoBM_GPU(); - - // Execute calculation in two threads using two GPUs - int devices[] = {0, 1}; - parallel_do(devices, devices + 2, Worker()); - - // Release the first GPU resources - setDevice(0); - imshow("GPU #0 result", Mat(d_result[0])); - d_left[0].release(); - d_right[0].release(); - d_result[0].release(); - delete bm[0]; - - // Release the second GPU resources - setDevice(1); - imshow("GPU #1 result", Mat(d_result[1])); - d_left[1].release(); - d_right[1].release(); - d_result[1].release(); - delete bm[1]; - - waitKey(); - return 0; -} + leftGrayFrame.create(leftFrame.size(), CV_8UC1); + rightGrayFrame.create(leftFrame.size(), CV_8UC1); + cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY); + cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY); -void Worker::operator()(int device_id) const -{ - setDevice(device_id); + tm.reset(); tm.start(); + gpu0Alg.compute(leftGrayFrame, rightGrayFrame, disparityGpu0); + tm.stop(); - bm[device_id]->operator()(d_left[device_id], d_right[device_id], - d_result[device_id]); + const double gpu0Time = tm.getTimeMilli(); - std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() - << "): finished\n"; -} + tm.reset(); tm.start(); + gpu1Alg.compute(leftGrayFrame, rightGrayFrame, disparityGpu1); + tm.stop(); -#endif + const double gpu1Time = tm.getTimeMilli(); + + tm.reset(); tm.start(); + multiThreadAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiThread); + tm.stop(); + + const double multiThreadTime = tm.getTimeMilli(); + + tm.reset(); tm.start(); + multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream); + tm.stop(); + + const double multiStreamTime = tm.getTimeMilli(); + + cout << "| " << setw(5) << i << " | " + << setw(8) << setprecision(1) << fixed << gpu0Time << " | " + << setw(8) << setprecision(1) << fixed << gpu1Time << " | " + << setw(15) << setprecision(1) << fixed << multiThreadTime << " | " + << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl; + + resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA); + resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA); + resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA); + resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA); + + imshow("disparityGpu0", disparityGpu0Show); + imshow("disparityGpu1", disparityGpu1Show); + imshow("disparityMultiThread", disparityMultiThreadShow); + imshow("disparityMultiStream", disparityMultiStreamShow); + + const int key = waitKey(30) & 0xff; + if (key == 27) + break; + } + + cout << "-------------------------------------------------------------------" << endl; + + return 0; +}