diff --git a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
index 4ccaa70e1c..c89e00346a 100644
--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@@ -142,6 +142,10 @@ private:
     Rect dst_roi_final_;
     bool can_use_gpu_;
     int weight_type_; //CV_32F or CV_16S
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
+    std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+#endif
 };
 
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index 1d2fe9e597..dc7aecb08c 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -43,6 +43,23 @@
 #include "precomp.hpp"
 #include "opencl_kernels_stitching.hpp"
 
+#ifdef HAVE_CUDA
+    namespace cv { namespace cuda { namespace device
+    {
+        namespace blend
+        {
+            void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                    PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc);
+            void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                    PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc);
+            void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                               const int width, const int height);
+            void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                               const int width, const int height);
+        }
+    }}}
+#endif
+
 namespace cv {
 namespace detail {
 
@@ -228,21 +245,46 @@ void MultiBandBlender::prepare(Rect dst_roi)
 
     Blender::prepare(dst_roi);
 
-    dst_pyr_laplace_.resize(num_bands_ + 1);
-    dst_pyr_laplace_[0] = dst_;
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        gpu_dst_pyr_laplace_.resize(num_bands_ + 1);
+        gpu_dst_pyr_laplace_[0].create(dst_roi.size(), CV_16SC3);
+        gpu_dst_pyr_laplace_[0].setTo(Scalar::all(0));
 
-    dst_band_weights_.resize(num_bands_ + 1);
-    dst_band_weights_[0].create(dst_roi.size(), weight_type_);
-    dst_band_weights_[0].setTo(0);
+        gpu_dst_band_weights_.resize(num_bands_ + 1);
+        gpu_dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        gpu_dst_band_weights_[0].setTo(0);
 
-    for (int i = 1; i <= num_bands_; ++i)
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            gpu_dst_pyr_laplace_[i].create((gpu_dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                           (gpu_dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            gpu_dst_band_weights_[i].create((gpu_dst_band_weights_[i - 1].rows + 1) / 2,
+                                            (gpu_dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            gpu_dst_band_weights_[i].setTo(0);
+        }
+    }
+    else
+#endif
     {
-        dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
-                                   (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
-        dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
-                                    (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
-        dst_pyr_laplace_[i].setTo(Scalar::all(0));
-        dst_band_weights_[i].setTo(0);
+        dst_pyr_laplace_.resize(num_bands_ + 1);
+        dst_pyr_laplace_[0] = dst_;
+
+        dst_band_weights_.resize(num_bands_ + 1);
+        dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        dst_band_weights_[0].setTo(0);
+
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                       (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
+                                        (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            dst_band_weights_[i].setTo(0);
+        }
     }
 }
 
@@ -312,6 +354,76 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     int bottom = br_new.y - tl.y - img.rows;
     int right = br_new.x - tl.x - img.cols;
 
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        // Create the source image Laplacian pyramid
+        cuda::GpuMat gpu_img;
+        gpu_img.upload(img);
+        cuda::GpuMat img_with_border;
+        cuda::copyMakeBorder(gpu_img, img_with_border, top, bottom, left, right, BORDER_REFLECT);
+        std::vector<cuda::GpuMat> gpu_src_pyr_laplace(num_bands_ + 1);
+        img_with_border.convertTo(gpu_src_pyr_laplace[0], CV_16S);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_src_pyr_laplace[i], gpu_src_pyr_laplace[i + 1]);
+        for (int i = 0; i < num_bands_; ++i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_src_pyr_laplace[i + 1], up);
+            cuda::subtract(gpu_src_pyr_laplace[i], up, gpu_src_pyr_laplace[i]);
+        }
+
+        // Create the weight map Gaussian pyramid
+        cuda::GpuMat gpu_mask;
+        gpu_mask.upload(mask);
+        cuda::GpuMat weight_map;
+        std::vector<cuda::GpuMat> gpu_weight_pyr_gauss(num_bands_ + 1);
+
+        if (weight_type_ == CV_32F)
+        {
+            gpu_mask.convertTo(weight_map, CV_32F, 1. / 255.);
+        }
+        else // weight_type_ == CV_16S
+        {
+            gpu_mask.convertTo(weight_map, CV_16S);
+            cuda::GpuMat add_mask;
+            cuda::compare(gpu_mask, 0, add_mask, CMP_NE);
+            cuda::add(weight_map, Scalar::all(1), weight_map, add_mask);
+        }
+        cuda::copyMakeBorder(weight_map, gpu_weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_weight_pyr_gauss[i], gpu_weight_pyr_gauss[i + 1]);
+
+        int y_tl = tl_new.y - dst_roi_.y;
+        int y_br = br_new.y - dst_roi_.y;
+        int x_tl = tl_new.x - dst_roi_.x;
+        int x_br = br_new.x - dst_roi_.x;
+
+        // Add weighted layer of the source image to the final Laplacian pyramid layer
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
+            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace[i];
+            cuda::GpuMat _dst_pyr_laplace = gpu_dst_pyr_laplace_[i](rc);
+            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss[i];
+            cuda::GpuMat _dst_band_weights = gpu_dst_band_weights_[i](rc);
+
+            using namespace cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                addSrcWeightGpu32F(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            else
+            {
+                addSrcWeightGpu16S(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            x_tl /= 2; y_tl /= 2;
+            x_br /= 2; y_br /= 2;
+        }
+        return;
+    }
+#endif
+
     // Create the source image Laplacian pyramid
     UMat img_with_border;
     copyMakeBorder(_img, img_with_border, top, bottom, left, right,
@@ -322,10 +434,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 #endif
 
     std::vector<UMat> src_pyr_laplace;
-    if (can_use_gpu_ && img_with_border.depth() == CV_16S)
-        createLaplacePyrGpu(img_with_border, num_bands_, src_pyr_laplace);
-    else
-        createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
+    createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
 
     LOGLN("  Create the source image Laplacian pyramid, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
 #if ENABLE_LOG
@@ -431,20 +540,57 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 
 void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 {
-    for (int i = 0; i <= num_bands_; ++i)
-        normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
-
+    cv::UMat dst_band_weights_0;
+    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
-        restoreImageFromLaplacePyrGpu(dst_pyr_laplace_);
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            cuda::GpuMat dst_i = gpu_dst_pyr_laplace_[i];
+            cuda::GpuMat weight_i = gpu_dst_band_weights_[i];
+
+            using namespace ::cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                normalizeUsingWeightMapGpu32F(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+            else
+            {
+                normalizeUsingWeightMapGpu16S(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+        }
+
+        // Restore image from Laplacian pyramid
+        for (size_t i = num_bands_; i > 0; --i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_dst_pyr_laplace_[i], up);
+            cuda::add(up, gpu_dst_pyr_laplace_[i - 1], gpu_dst_pyr_laplace_[i - 1]);
+        }
+
+        gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
+        gpu_dst_band_weights_[0].download(dst_band_weights_0);
+
+        gpu_dst_pyr_laplace_.clear();
+        gpu_dst_band_weights_.clear();
+    }
     else
+#endif
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+            normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
+
         restoreImageFromLaplacePyr(dst_pyr_laplace_);
 
-    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
-    dst_ = dst_pyr_laplace_[0](dst_rc);
-    UMat _dst_mask;
-    compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
-    dst_pyr_laplace_.clear();
-    dst_band_weights_.clear();
+        dst_ = dst_pyr_laplace_[0](dst_rc);
+        dst_band_weights_0 = dst_band_weights_[0];
+
+        dst_pyr_laplace_.clear();
+        dst_band_weights_.clear();
+    }
+
+    compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
 
     Blender::blend(dst, dst_mask);
 }
diff --git a/modules/stitching/src/cuda/multiband_blend.cu b/modules/stitching/src/cuda/multiband_blend.cu
new file mode 100644
index 0000000000..daa0005ec0
--- /dev/null
+++ b/modules/stitching/src/cuda/multiband_blend.cu
@@ -0,0 +1,112 @@
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/types.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace blend
+    {
+        __global__ void addSrcWeightKernel16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                              PtrStep<short> dst, PtrStep<short> dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                short w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += short((v.x * w) >> 8);
+                ((short3*)dst.ptr(y))[x].y += short((v.y * w) >> 8);
+                ((short3*)dst.ptr(y))[x].z += short((v.z * w) >> 8);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel16S<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void addSrcWeightKernel32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                              PtrStep<short> dst, PtrStepf dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                float w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += static_cast<short>(v.x * w);
+                ((short3*)dst.ptr(y))[x].y += static_cast<short>(v.y * w);
+                ((short3*)dst.ptr(y))[x].z += static_cast<short>(v.z * w);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel32F<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void normalizeUsingWeightKernel16S(const PtrStep<short> weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                const short3 v = ((short3*)src.ptr(y))[x];
+                short w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(short((v.x << 8) / w),
+                                                       short((v.y << 8) / w), short((v.z << 8) / w));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel16S<<<grid, threads>>> (weight, src, width, height);
+        }
+
+        __global__ void normalizeUsingWeightKernel32F(const PtrStepf weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                static const float WEIGHT_EPS = 1e-5f;
+                const short3 v = ((short3*)src.ptr(y))[x];
+                float w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(static_cast<short>(v.x / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.y / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.z / (w + WEIGHT_EPS)));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel32F<<<grid, threads>>> (weight, src, width, height);
+        }
+    }
+}}}
+
+#endif
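
Usage note (reviewer sketch, not part of the patch): a minimal example of how the GPU path introduced above might be exercised through the public MultiBandBlender interface, assuming OpenCV is built with the cudaarithm and cudawarping modules and a CUDA device is available. The function and variable names (blendPair, img0, mask0, pano_roi, ...) are illustrative only; without CUDA support the blender is expected to fall back to the CPU path.

#include <opencv2/core.hpp>
#include <opencv2/stitching/detail/blenders.hpp>

using namespace cv;

// Blend two already-warped images placed at tl0/tl1 inside the panorama ROI.
void blendPair(const Mat &img0, const Mat &mask0, Point tl0,
               const Mat &img1, const Mat &mask1, Point tl1,
               Rect pano_roi)
{
    // try_use_gpu = true requests the CUDA code path added by this patch.
    detail::MultiBandBlender blender(true /*try_use_gpu*/, 5 /*num_bands*/);
    blender.prepare(pano_roi);

    // feed() expects CV_16SC3 images (the stitching pipeline converts to this
    // depth) and CV_8U masks.
    Mat img0_s, img1_s;
    img0.convertTo(img0_s, CV_16S);
    img1.convertTo(img1_s, CV_16S);
    blender.feed(img0_s, mask0, tl0);
    blender.feed(img1_s, mask1, tl1);

    Mat result, result_mask;
    blender.blend(result, result_mask);  // result is CV_16SC3; convert to CV_8U for display
}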