From 9f82ac18d4756b43e779aebbc2a7e832775121b5 Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Tue, 24 Mar 2015 17:23:21 +0100
Subject: [PATCH 1/5] cudastereo: reprojectImageTo3D: enabled CV_32S and CV_32F
 disparity formats

This is to achieve parity with the CPU equivalent.
---
 modules/cudastereo/src/cuda/util.cu | 4 ++++
 modules/cudastereo/src/util.cpp     | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/cudastereo/src/cuda/util.cu b/modules/cudastereo/src/cuda/util.cu
index 432f46191d..6826f90f71 100644
--- a/modules/cudastereo/src/cuda/util.cu
+++ b/modules/cudastereo/src/cuda/util.cu
@@ -98,6 +98,10 @@ namespace cv { namespace cuda { namespace device
     template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
     template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
     template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<int, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<int, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<float, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<float, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
 
     /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
 
diff --git a/modules/cudastereo/src/util.cpp b/modules/cudastereo/src/util.cpp
index ba7c23f6b5..a39d28cb2c 100644
--- a/modules/cudastereo/src/util.cpp
+++ b/modules/cudastereo/src/util.cpp
@@ -66,16 +66,16 @@ void cv::cuda::reprojectImageTo3D(InputArray _disp, OutputArray _xyz, InputArray
     using namespace cv::cuda::device;
 
     typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] =
+    static const func_t funcs[2][6] =
     {
-        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
-        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
+        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>, reprojectImageTo3D_gpu<int, float3>, reprojectImageTo3D_gpu<float, float3>},
+        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>, reprojectImageTo3D_gpu<int, float4>, reprojectImageTo3D_gpu<float, float4>}
     };
 
     GpuMat disp = _disp.getGpuMat();
     Mat Q = _Q.getMat();
 
-    CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+    CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S || disp.type() == CV_32S || disp.type() == CV_32F );
     CV_Assert( Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous() );
     CV_Assert( dst_cn == 3 || dst_cn == 4 );
 

From bf5e930468b07a7e6054e7bc734d1f6a6b37596b Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Tue, 24 Mar 2015 18:10:22 +0100
Subject: [PATCH 2/5] cudastereo: drawColorDisp: enabled CV_32S and CV_32F
 disparity formats

---
 modules/cudastereo/src/cuda/util.cu | 51 +++++++++++++++++++++++++++++
 modules/cudastereo/src/util.cpp     |  6 ++--
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/modules/cudastereo/src/cuda/util.cu b/modules/cudastereo/src/cuda/util.cu
index 6826f90f71..b65c240ee2 100644
--- a/modules/cudastereo/src/cuda/util.cu
+++ b/modules/cudastereo/src/cuda/util.cu
@@ -205,6 +205,29 @@ namespace cv { namespace cuda { namespace device
         }
     }
 
+    __global__ void drawColorDisp(int* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+    { // int-disparity variant: each thread converts the disparity at (x, y) with cvtPixel and stores it as one uint in out_image
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if(x < width && y < height) // the host wrapper rounds the grid up with divUp, so threads past the image edge must do nothing
+        {
+            uint *line = (uint*)(out_image + y * out_step); // out_image is uchar*, so out_step acts as a byte stride here
+            line[x] = cvtPixel(disp[y*disp_step + x], ndisp); // disp_step counts int elements (the host wrapper passes step / sizeof(int))
+        }
+    }
+
+    __global__ void drawColorDisp(float* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+    { // float-disparity variant: each thread converts the disparity at (x, y) with cvtPixel and stores it as one uint in out_image
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if(x < width && y < height) // the host wrapper rounds the grid up with divUp, so threads past the image edge must do nothing
+        {
+            uint *line = (uint*)(out_image + y * out_step); // out_image is uchar*, so out_step acts as a byte stride here
+            line[x] = cvtPixel(disp[y*disp_step + x], ndisp); // disp_step counts float elements (the host wrapper passes step / sizeof(float))
+        }
+    }
 
     void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
     {
@@ -233,6 +256,34 @@ namespace cv { namespace cuda { namespace device
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
+
+    void drawColorDisp_gpu(const PtrStepSz<int>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+    { // Host-side launcher: runs the int-disparity drawColorDisp kernel over src on the given stream, writing into dst
+        dim3 threads(32, 8, 1);
+        dim3 grid(1, 1, 1);
+        grid.x = divUp(src.cols, threads.x); // one thread per source pixel; rounded up so partial blocks cover the right/bottom edges
+        grid.y = divUp(src.rows, threads.y);
+
+        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(int), dst.data, dst.step, src.cols, src.rows, ndisp); // src.step is divided by sizeof(int) so the kernel indexes disp in elements; dst.step is passed through unchanged
+        cudaSafeCall( cudaGetLastError() ); // a kernel launch returns no status itself; launch errors are read back here
+
+        if (stream == 0) // only the 0 stream is synchronized here; for any other stream this function does not wait for the kernel
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    void drawColorDisp_gpu(const PtrStepSz<float>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+    { // Host-side launcher: runs the float-disparity drawColorDisp kernel over src on the given stream, writing into dst
+        dim3 threads(32, 8, 1);
+        dim3 grid(1, 1, 1);
+        grid.x = divUp(src.cols, threads.x); // one thread per source pixel; rounded up so partial blocks cover the right/bottom edges
+        grid.y = divUp(src.rows, threads.y);
+
+        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(float), dst.data, dst.step, src.cols, src.rows, ndisp); // src.step is divided by sizeof(float) so the kernel indexes disp in elements; dst.step is passed through unchanged
+        cudaSafeCall( cudaGetLastError() ); // a kernel launch returns no status itself; launch errors are read back here
+
+        if (stream == 0) // only the 0 stream is synchronized here; for any other stream this function does not wait for the kernel
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }}} // namespace cv { namespace cuda { namespace cudev
 
 
diff --git a/modules/cudastereo/src/util.cpp b/modules/cudastereo/src/util.cpp
index a39d28cb2c..09b108ca89 100644
--- a/modules/cudastereo/src/util.cpp
+++ b/modules/cudastereo/src/util.cpp
@@ -92,6 +92,8 @@ namespace cv { namespace cuda { namespace device
 {
     void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
     void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
+    void drawColorDisp_gpu(const PtrStepSz<int>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
+    void drawColorDisp_gpu(const PtrStepSz<float>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
 }}}
 
 namespace
@@ -111,11 +113,11 @@ namespace
 void cv::cuda::drawColorDisp(InputArray _src, OutputArray dst, int ndisp, Stream& stream)
 {
     typedef void (*drawColorDisp_caller_t)(const GpuMat& src, OutputArray dst, int ndisp, const cudaStream_t& stream);
-    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
+    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, drawColorDisp_caller<int>, drawColorDisp_caller<float>, 0, 0};
 
     GpuMat src = _src.getGpuMat();
 
-    CV_Assert( src.type() == CV_8U || src.type() == CV_16S );
+    CV_Assert( src.type() == CV_8U || src.type() == CV_16S || src.type() == CV_32S || src.type() == CV_32F );
 
     drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
 }

From 980d84e4a2722d1762cfc9e800536930a8a58d26 Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Tue, 24 Mar 2015 21:06:31 +0100
Subject: [PATCH 3/5] calib3d: improve documentation of reprojectImageTo3D

Make a note about 16-bit signed format - the function assumes that
values have no fractional bits (so 16-bit disparity from StereoBM
and StereoSGBM cannot be directly used!)
---
 modules/calib3d/include/opencv2/calib3d.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index f80b59c751..21fbf9ddda 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1375,7 +1375,8 @@ CV_EXPORTS_W void validateDisparity( InputOutputArray disparity, InputArray cost
 /** @brief Reprojects a disparity image to 3D space.
 
 @param disparity Input single-channel 8-bit unsigned, 16-bit signed, 32-bit signed or 32-bit
-floating-point disparity image.
+floating-point disparity image. If 16-bit signed format is used, the values are assumed to have no
+fractional bits.
 @param _3dImage Output 3-channel floating-point image of the same size as disparity . Each
 element of _3dImage(x,y) contains 3D coordinates of the point (x,y) computed from the disparity
 map.

From 1c804124d170772166da5d898ffd20f31c44c91f Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Tue, 24 Mar 2015 21:07:46 +0100
Subject: [PATCH 4/5] cudastereo: updated documentation for reprojectImageTo3D
 and drawColorDisp

Updated the list of supported input formats, added note about 16-bit
signed format (no fractional bits).
---
 modules/cudastereo/include/opencv2/cudastereo.hpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/modules/cudastereo/include/opencv2/cudastereo.hpp b/modules/cudastereo/include/opencv2/cudastereo.hpp
index af265bb44f..0f16e176fd 100644
--- a/modules/cudastereo/include/opencv2/cudastereo.hpp
+++ b/modules/cudastereo/include/opencv2/cudastereo.hpp
@@ -295,7 +295,9 @@ CV_EXPORTS Ptr<cuda::DisparityBilateralFilter>
 
 /** @brief Reprojects a disparity image to 3D space.
 
-@param disp Input disparity image. CV_8U and CV_16S types are supported.
+@param disp Input single-channel 8-bit unsigned, 16-bit signed, 32-bit signed or 32-bit
+floating-point disparity image. If 16-bit signed format is used, the values are assumed to have no
+fractional bits.
 @param xyzw Output 3- or 4-channel floating-point image of the same size as disp . Each element of
 xyzw(x,y) contains 3D coordinates (x,y,z) or (x,y,z,1) of the point (x,y) , computed from the
 disparity map.
@@ -309,8 +311,10 @@ CV_EXPORTS void reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray
 
 /** @brief Colors a disparity image.
 
-@param src_disp Source disparity image. CV_8UC1 and CV_16SC1 types are supported.
-@param dst_disp Output disparity image. It has the same size as src_disp . The type is CV_8UC4
+@param src_disp Input single-channel 8-bit unsigned, 16-bit signed, 32-bit signed or 32-bit
+floating-point disparity image. If 16-bit signed format is used, the values are assumed to have no
+fractional bits.
+@param dst_disp Output disparity image. It has the same size as src_disp. The type is CV_8UC4
 in BGRA format (alpha = 255).
 @param ndisp Number of disparities.
 @param stream Stream for the asynchronous version.

From 7452f9a56e2810fe9d7733f51e485a95edfa92c4 Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Tue, 24 Mar 2015 21:12:54 +0100
Subject: [PATCH 5/5] cudastereo: document lack of fractional bits in 16-bit
 signed format for StereoBeliefPropagation

---
 modules/cudastereo/include/opencv2/cudastereo.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/cudastereo/include/opencv2/cudastereo.hpp b/modules/cudastereo/include/opencv2/cudastereo.hpp
index 0f16e176fd..c734f5cca3 100644
--- a/modules/cudastereo/include/opencv2/cudastereo.hpp
+++ b/modules/cudastereo/include/opencv2/cudastereo.hpp
@@ -138,7 +138,8 @@ public:
     @param data User-specified data cost, a matrix of msg_type type and
     Size(\<image columns\>\*ndisp, \<image rows\>) size.
     @param disparity Output disparity map. If disparity is empty, the output type is CV_16SC1 .
-    Otherwise, the type is retained.
+    Otherwise, the type is retained. In 16-bit signed format, the disparity values do not have
+    fractional bits.
     @param stream Stream for the asynchronous version.
      */
     virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;