From 48dec9c03afc4c9ab26b536e713d072f4840c007 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Sat, 24 Sep 2011 05:58:29 +0000
Subject: [PATCH] Fixed GPU version of multi-band blending in stitching module

---
 modules/gpu/src/cuda/element_operations.cu    | 19 +++---------
 modules/gpu/src/element_operations.cpp        |  7 +++--
 .../opencv2/stitching/detail/blenders.hpp     |  2 +-
 modules/stitching/src/blenders.cpp            | 31 +++++++++++++++++--
 4 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 7548e901fd..70f3bab8e6 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -654,27 +654,25 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////////////////////////////////////////////////
-    // multiply
+    // add
 
-    struct add_16sc4 : binary_function<short4, short4, short4>
+    // transform() tuning for plus<short>: only smart_block_dim_y and smart_shift
+    // are overridden here, everything else comes from DefaultTransformFunctorTraits.
+    template <> struct TransformFunctorTraits< plus<short> > : DefaultTransformFunctorTraits< plus<short> >
     {
-        __device__ __forceinline__ short4 operator ()(short4 a, short4 b) const
-        {
-            return make_short4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-        }
-    };
-
-    template <> struct TransformFunctorTraits<add_16sc4> : DefaultTransformFunctorTraits<add_16sc4>
-    {
-        enum { smart_block_dim_x = 8 };
         enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
+        enum { smart_shift = 4 };
     };
 
-    void add_gpu(const DevMem2D_<short4>& src1, const DevMem2D_<short4>& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)
+    // Element-wise dst = src1 + src2 on `stream`. The untyped DevMem2D arguments
+    // are reinterpreted as DevMem2D_<T> and combined with plus<T> via transform().
+    // NOTE(review): the caller is assumed to pass equally sized matrices whose
+    // element type really is T -- nothing is validated here.
+    template <typename T> void add_gpu(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
     {
-        transform(static_cast< DevMem2D_<short4> >(src1), static_cast< DevMem2D_<short4> >(src2), 
-                  static_cast< DevMem2D_<short4> >(dst), add_16sc4(), stream);
+        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, plus<T>(), stream);
     }
 
+    // Only the 16-bit signed instantiation is emitted from this translation unit.
+    template void add_gpu<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
 
     //////////////////////////////////////////////////////////////////////////
     // multiply
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 4cf857ed4b..1173803c98 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -177,16 +177,20 @@ namespace
 
 namespace cv { namespace gpu { namespace device
 {
-    void add_gpu(const DevMem2D_<short4>& src1, const DevMem2D_<short4>& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
+    template <typename T>
+    void add_gpu(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
 }}}
 
 void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    if (src1.type() == CV_16SC4 && src2.type() == CV_16SC4)
+    if (src1.depth() == CV_16S && src2.depth() == CV_16S)
     {
-        CV_Assert(src1.size() == src2.size());
+        // Both inputs are flattened to one channel below, so they must agree in
+        // channel count as well as size; a channel mismatch would make the
+        // reshaped views differ in width.
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        device::add_gpu(src1, src2, dst, StreamAccessor::getStream(stream));
+        device::add_gpu<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
     }
     else
         nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
diff --git a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
index e14744787e..b291f93fd0 100644
--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@@ -114,11 +114,11 @@ void CV_EXPORTS normalizeUsingWeightMap(const Mat& weight, Mat& src);
 void CV_EXPORTS createWeightMap(const Mat& mask, float sharpness, Mat& weight);
 
 void CV_EXPORTS createLaplacePyr(const Mat &img, int num_levels, std::vector<Mat>& pyr);
-
 void CV_EXPORTS createLaplacePyrGpu(const Mat &img, int num_levels, std::vector<Mat>& pyr);
 
 // Restores source image
 void CV_EXPORTS restoreImageFromLaplacePyr(std::vector<Mat>& pyr);
+void CV_EXPORTS restoreImageFromLaplacePyrGpu(std::vector<Mat>& pyr);
 
 } // namespace detail
 } // namespace cv
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index ccf2d02f41..957fe8d3d6 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -289,7 +289,10 @@ void MultiBandBlender::blend(Mat &dst, Mat &dst_mask)
     for (int i = 0; i <= num_bands_; ++i)
         normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
 
-    restoreImageFromLaplacePyr(dst_pyr_laplace_);
+    if (can_use_gpu_)
+        restoreImageFromLaplacePyrGpu(dst_pyr_laplace_);
+    else
+        restoreImageFromLaplacePyr(dst_pyr_laplace_);
 
     dst_ = dst_pyr_laplace_[0];
     dst_ = dst_(Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width));
@@ -346,6 +349,7 @@ void createLaplacePyr(const Mat &img, int num_levels, vector<Mat> &pyr)
     }
 }
 
+
 void createLaplacePyrGpu(const Mat &img, int num_levels, vector<Mat> &pyr)
 {
 #ifndef ANDROID
@@ -368,9 +372,10 @@ void createLaplacePyrGpu(const Mat &img, int num_levels, vector<Mat> &pyr)
 #endif
 }
 
+
 void restoreImageFromLaplacePyr(vector<Mat> &pyr)
 {
-    if (pyr.size() == 0)
+    if (pyr.empty())
         return;
     Mat tmp;
     for (size_t i = pyr.size() - 1; i > 0; --i)
@@ -380,5 +385,38 @@ void restoreImageFromLaplacePyr(vector<Mat> &pyr)
     }
 }
 
+
+// GPU counterpart of restoreImageFromLaplacePyr(): copies every level into a
+// GpuMat, then walks from the last level down to level 1, adding the
+// upsampled level i to level i - 1. Only pyr[0] (the restored image) is
+// written back; the other entries of pyr are left untouched.
+// NOTE(review): gpu::pyrUp() is called without a target size, so each level is
+// presumably expected to be exactly twice the size of the next one -- confirm.
+// On ANDROID builds, where the gpu:: code is compiled out, this falls back to
+// the CPU implementation instead of silently returning an unrestored pyramid.
+void restoreImageFromLaplacePyrGpu(vector<Mat> &pyr)
+{
+#ifndef ANDROID
+    if (pyr.empty())
+        return;
+
+    vector<gpu::GpuMat> gpu_pyr(pyr.size());
+    for (size_t i = 0; i < pyr.size(); ++i)
+        gpu_pyr[i] = pyr[i];
+
+    gpu::GpuMat tmp;
+    for (size_t i = pyr.size() - 1; i > 0; --i)
+    {
+        gpu::pyrUp(gpu_pyr[i], tmp);
+        gpu::add(tmp, gpu_pyr[i - 1], gpu_pyr[i - 1]);
+    }
+
+    pyr[0] = gpu_pyr[0];
+#else
+    restoreImageFromLaplacePyr(pyr);
+#endif
+}
+
 } // namespace detail
 } // namespace cv