From ec70282bf75341db1d60c9f4a19328d716709044 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Thu, 25 Apr 2013 15:23:44 +0400
Subject: [PATCH] switched to Input/Output Array in min/max operations

---
 .../gpuarithm/include/opencv2/gpuarithm.hpp   |  18 +-
 modules/gpuarithm/src/element_operations.cpp  | 247 ++++++------------
 modules/gpuimgproc/src/hough.cpp              |   2 +-
 modules/nonfree/src/surf_gpu.cpp              |   2 +-
 4 files changed, 82 insertions(+), 187 deletions(-)
diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 943b3a1d8a..42d69ef946 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -115,6 +115,12 @@ CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream
 //! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
 CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
 
+//! computes per-element minimum of two arrays or an array and a scalar (dst = min(src1, src2))
+CV_EXPORTS void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes per-element maximum of two arrays or an array and a scalar (dst = max(src1, src2))
+CV_EXPORTS void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
 //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
 CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
                             int dtype = -1, Stream& stream = Stream::Null());
@@ -125,18 +131,6 @@ static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2
     addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
 }
 
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
 //! implements generalized matrix product algorithm GEMM from BLAS
 CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
     const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
diff --git a/modules/gpuarithm/src/element_operations.cpp b/modules/gpuarithm/src/element_operations.cpp
index 5a0f206aed..425b699a04 100644
--- a/modules/gpuarithm/src/element_operations.cpp
+++ b/modules/gpuarithm/src/element_operations.cpp
@@ -48,46 +48,30 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
 void cv::gpu::add(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::subtract(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::multiply(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::divide(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::absdiff(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::abs(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::sqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::sqrt(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::exp(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::log(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::pow(InputArray, double, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::compare(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::bitwise_not(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::bitwise_or(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::bitwise_and(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::bitwise_xor(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::rshift(InputArray, Scalar_<int>, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::lshift(InputArray, Scalar_<int>, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::min(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::max(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
 
@@ -2262,6 +2246,15 @@ void cv::gpu::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream
 //////////////////////////////////////////////////////////////////////////////
 // Minimum and maximum operations
 
+namespace
+{
+    enum
+    {
+        MIN_OP, // index 0: selects the min* kernels in the minMaxMat / minMaxScalar dispatch tables
+        MAX_OP  // index 1: selects the max* kernels in the same dispatch tables
+    };
+}
+
 namespace arithm
 {
     void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
@@ -2275,116 +2268,49 @@ namespace arithm
     template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
+void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int op)
 {
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
+    static const func_t funcs[2][7] =
     {
-        minMat<unsigned char>,
-        minMat<signed char>,
-        minMat<unsigned short>,
-        minMat<short>,
-        minMat<int>,
-        minMat<float>,
-        minMat<double>
-    };
-
-    const int depth = src1.depth();
-    const int cn = src1.channels();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (depth == CV_8U || depth == CV_16U)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
         {
-            if (depth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
-
-                return;
-            }
-            else if (depth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
-
-                return;
-            }
+            minMat<unsigned char>,
+            minMat<signed char>,
+            minMat<unsigned short>,
+            minMat<short>,
+            minMat<int>,
+            minMat<float>,
+            minMat<double>
+        },
+        {
+            maxMat<unsigned char>,
+            maxMat<signed char>,
+            maxMat<unsigned short>,
+            maxMat<short>,
+            maxMat<int>,
+            maxMat<float>,
+            maxMat<double>
         }
-    }
-
-    const func_t func = funcs[depth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, stream);
-}
-
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
-{
-    using namespace arithm;
+    };
 
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
+    typedef void (*opt_func_t)(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    static const opt_func_t funcs_v4[2] =
+    {
+        minMat_v4, maxMat_v4
+    };
+    static const opt_func_t funcs_v2[2] =
     {
-        maxMat<unsigned char>,
-        maxMat<signed char>,
-        maxMat<unsigned short>,
-        maxMat<short>,
-        maxMat<int>,
-        maxMat<float>,
-        maxMat<double>
+        minMat_v2, maxMat_v2
     };
 
     const int depth = src1.depth();
     const int cn = src1.channels();
 
     CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -2404,10 +2330,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
             {
                 const int vcols = src1_.cols >> 2;
 
-                maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                funcs_v4[op](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                             stream);
 
                 return;
             }
@@ -2415,17 +2341,17 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
             {
                 const int vcols = src1_.cols >> 1;
 
-                maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                funcs_v2[op](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                             stream);
 
                 return;
             }
         }
     }
 
-    const func_t func = funcs[depth];
+    const func_t func = funcs[op][depth];
 
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
@@ -2441,20 +2367,31 @@ namespace
     }
 }
 
-void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+void minMaxScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
 {
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
+    static const func_t funcs[2][7] =
     {
-        minScalar<unsigned char>,
-        minScalar<signed char>,
-        minScalar<unsigned short>,
-        minScalar<short>,
-        minScalar<int>,
-        minScalar<float>,
-        minScalar<double>
+        {
+            minScalar<unsigned char>,
+            minScalar<signed char>,
+            minScalar<unsigned short>,
+            minScalar<short>,
+            minScalar<int>,
+            minScalar<float>,
+            minScalar<double>
+        },
+        {
+            maxScalar<unsigned char>,
+            maxScalar<signed char>,
+            maxScalar<unsigned short>,
+            maxScalar<short>,
+            maxScalar<int>,
+            maxScalar<float>,
+            maxScalar<double>
+        }
     };
 
     typedef double (*cast_func_t)(double sc);
@@ -2468,53 +2405,17 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
     CV_Assert( depth <= CV_64F );
     CV_Assert( src.channels() == 1 );
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+    funcs[op][depth](src, cast_func[depth](val[0]), dst, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+void cv::gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        maxScalar<unsigned char>,
-        maxScalar<signed char>,
-        maxScalar<unsigned short>,
-        maxScalar<short>,
-        maxScalar<int>,
-        maxScalar<float>,
-        maxScalar<double>
-    };
-
-    typedef double (*cast_func_t)(double sc);
-    static const cast_func_t cast_func[] =
-    {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
-    };
-
-    const int depth = src.depth();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src.channels() == 1 );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
+    arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MIN_OP);
+}
 
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+void cv::gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MAX_OP);
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpuimgproc/src/hough.cpp b/modules/gpuimgproc/src/hough.cpp
index bc0a8a400d..15e5297623 100644
--- a/modules/gpuimgproc/src/hough.cpp
+++ b/modules/gpuimgproc/src/hough.cpp
@@ -761,7 +761,7 @@ namespace
         {
             buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
                             r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            min(r_sizes, maxSize, r_sizes);
+            gpu::min(r_sizes, maxSize, r_sizes);
         }
     }
 
diff --git a/modules/nonfree/src/surf_gpu.cpp b/modules/nonfree/src/surf_gpu.cpp
index ace9bb53ab..82ade2927e 100644
--- a/modules/nonfree/src/surf_gpu.cpp
+++ b/modules/nonfree/src/surf_gpu.cpp
@@ -147,7 +147,7 @@ namespace
 
             if (use_mask)
             {
-                min(mask, 1.0, surf_.mask1);
+                gpu::min(mask, 1.0, surf_.mask1);
                 gpu::integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
                 maskOffset = bindMaskSumTex(surf_.maskSum);
             }