From 1c9f4e7ca24aeaf2b01f7600ba439a0661dd87b9 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <no@email>
Date: Tue, 10 May 2011 12:39:12 +0000
Subject: [PATCH] fixed gpu::meanStdDev and gpu::norm under CUDA 4.0; fixed
 compilation under Win64

---
 modules/gpu/src/matrix_operations.cpp | 10 ++++----
 modules/gpu/src/matrix_reductions.cpp | 37 +++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 6467ee7880..e1c34dda86 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -596,11 +596,11 @@ bool cv::gpu::CudaMem::canMapHostMemory()
 
 namespace
 {
-    int alignUp(int what, int alignment)
+    size_t alignUpStep(size_t what, size_t alignment)
     {
-        int alignMask = alignment-1;
-        int inverseAlignMask = ~alignMask;
-        int res = (what + alignMask) & inverseAlignMask;
+        size_t alignMask = alignment-1;
+        size_t inverseAlignMask = ~alignMask;
+        size_t res = (what + alignMask) & inverseAlignMask;
         return res;
     }
 }
@@ -626,7 +626,7 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
         {
             cudaDeviceProp prop;
             cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) );
-            step = alignUp(step, prop.textureAlignment);
+            step = alignUpStep(step, prop.textureAlignment);
         }
         int64 _nettosize = (int64)step*rows;
         size_t nettosize = (size_t)_nettosize;
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp
index 1cc8831a52..0d4fae35e4 100644
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -78,9 +78,28 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
     sz.width  = src.cols;
     sz.height = src.rows;
 
+#if NPP_VERSION_MAJOR >= 4
+
+    GpuMat d_buf(1, 2, CV_64F);
+
+    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, d_buf.ptr<double>(), d_buf.ptr<double>() + 1) );
+
+    cudaSafeCall( cudaThreadSynchronize() );
+
+    double buf[2];
+
+    Mat _buf(1, 2, CV_64F, buf);
+    d_buf.download(_buf);
+    mean[0] = buf[0];
+    stddev[0] = buf[1];
+
+#else
+    
     nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
 
     cudaSafeCall( cudaThreadSynchronize() );
+
+#endif
 }
 
 
@@ -131,14 +150,32 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     sz.height = src1.rows;
 
     int funcIdx = normType >> 1;
+        
+#if NPP_VERSION_MAJOR >= 4
+
+    GpuMat d_buf(1, 1, CV_64F);
+
+    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
+        src2.ptr<Npp8u>(), src2.step,
+        sz, d_buf.ptr<double>()) );
+
+    cudaSafeCall( cudaThreadSynchronize() );
+    
     double retVal;
+    Mat _buf(1, 1, CV_64F, &retVal);
+    d_buf.download(_buf);
+
+#else
 
+    double retVal;
     nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
         src2.ptr<Npp8u>(), src2.step,
         sz, &retVal) );
 
     cudaSafeCall( cudaThreadSynchronize() );
 
+#endif   
+
     return retVal;
 }