From 9f80317ffa976f7795a2ad39919a9682622bec9d Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Fri, 10 Dec 2010 13:36:00 +0000
Subject: [PATCH] fixed minor bug in gpu module, added first version of sum

---
 modules/gpu/src/cuda/imgproc.cu    |   4 +-
 modules/gpu/src/cuda/mathfunc.cu   | 184 +++++++++++++++++++++++++++--
 modules/gpu/src/match_template.cpp |  11 --
 3 files changed, 179 insertions(+), 20 deletions(-)

diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 143e8d09f1..4ba2a89797 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc

     ////////////////////////////// Column Sum //////////////////////////////////////

-    __global__ void columnSumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;

@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(256);
         dim3 grid(divUp(src.cols, threads.x));

-        columnSumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall(cudaThreadSynchronize());
     }

diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index dde2b048e7..3d9ceda5bd 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -450,6 +450,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -662,7 +664,6 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             minval[0] = (T)sminval[0];
             maxval[0] = (T)smaxval[0];
-            blocks_finished = 0;
         }
     }

@@ -744,6 +745,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -1005,7 +1008,6 @@ namespace cv { namespace gpu { namespace mathfunc
             maxval[0] = (T)smaxval[0];
             minloc[0] = sminloc[0];
             maxloc[0] = smaxloc[0];
-            blocks_finished = 0;
         }
     }

@@ -1102,6 +1104,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -1212,13 +1216,12 @@ namespace cv { namespace gpu { namespace mathfunc

         unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
         scount[tid] = tid < size ? count[tid] : 0;
-        sum_in_smem<nthreads, unsigned int>(scount, tid);
+        __syncthreads();

-        if (tid == 0)
-        {
+        sum_in_smem<nthreads, unsigned int>(scount, tid);
+
+        if (tid == 0)
             count[0] = scount[0];
-            blocks_finished = 0;
-        }
     }


@@ -1409,4 +1412,171 @@ namespace cv { namespace gpu { namespace mathfunc
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
+
+//////////////////////////////////////////////////////////////////////////////
+// Sum
+
+    namespace sum
+    {
+
+    __constant__ int ctwidth;
+    __constant__ int ctheight;
+    __device__ unsigned int blocks_finished = 0;
+
+    const int threads_x = 32;
+    const int threads_y = 8;
+
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+    {
+        threads = dim3(threads_x, threads_y);
+        grid = dim3(divUp(cols, threads.x * threads.y),
+                    divUp(rows, threads.y * threads.x));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
+    }
+
+
+    template <typename T>
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * sizeof(T);
+        bufrows = 1;
+    }
+
+
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    {
+        int twidth = divUp(divUp(cols, grid.x), threads.x);
+        int theight = divUp(divUp(rows, grid.y), threads.y);
+        cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
+        cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
+    }
+
+    template <typename T, int nthreads>
+    __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
+    {
+        __shared__ T smem[nthreads];
+
+        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+
+        T sum = 0;
+        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+        {
+            const T* ptr = src.ptr(y0 + y * blockDim.y);
+            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+                sum += ptr[x0 + x * blockDim.x];
+        }
+
+        smem[tid] = sum;
+        __syncthreads();
+
+        sum_in_smem<nthreads, T>(smem, tid);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+        __shared__ bool is_last;
+
+        if (tid == 0)
+        {
+            result[bid] = smem[0];
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = (ticket == gridDim.x * gridDim.y - 1);
+        }
+
+        __syncthreads();
+
+        if (is_last)
+        {
+            smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
+            __syncthreads();
+
+            sum_in_smem<nthreads, T>(smem, tid);
+
+            if (tid == 0)
+            {
+                result[0] = smem[0];
+                blocks_finished = 0;
+            }
+        }
+#else
+        if (tid == 0) result[bid] = smem[0];
+#endif
+    }
+
+
+    template <typename T>
+    T sum_caller(const DevMem2D_<T> src, PtrStep buf)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        T* buf_ = (T*)buf.ptr(0);
+
+        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        T sum;
+        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
+
+        return sum;
+    }
+
+    template unsigned char sum_caller(const DevMem2D_<unsigned char>, PtrStep);
+    template char sum_caller(const DevMem2D_<char>, PtrStep);
+    template unsigned short sum_caller(const DevMem2D_<unsigned short>, PtrStep);
+    template short sum_caller(const DevMem2D_<short>, PtrStep);
+    template int sum_caller(const DevMem2D_<int>, PtrStep);
+    template float sum_caller(const DevMem2D_<float>, PtrStep);
+    template double sum_caller(const DevMem2D_<double>, PtrStep);
+
+
+    template <typename T, int nthreads>
+    __global__ void sum_pass2_kernel(T* result, int size)
+    {
+        __shared__ T smem[nthreads];
+        int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        smem[tid] = tid < size ? result[tid] : 0;
+        sum_in_smem<nthreads, T>(smem, tid);
+
+        if (tid == 0)
+            result[0] = smem[0];
+    }
+
+
+    template <typename T>
+    T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        T* buf_ = (T*)buf.ptr(0);
+
+        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+            buf_, grid.x * grid.y);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        T sum;
+        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
+
+        return sum;
+    }
+
+    template unsigned char sum_multipass_caller(const DevMem2D_<unsigned char>, PtrStep);
+    template char sum_multipass_caller(const DevMem2D_<char>, PtrStep);
+    template unsigned short sum_multipass_caller(const DevMem2D_<unsigned short>, PtrStep);
+    template short sum_multipass_caller(const DevMem2D_<short>, PtrStep);
+    template int sum_multipass_caller(const DevMem2D_<int>, PtrStep);
+    template float sum_multipass_caller(const DevMem2D_<float>, PtrStep);
+
+    } // namespace sum
 }}}
diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp
index 7b85ac41f6..6972f8168b 100644
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -244,17 +244,6 @@ namespace
     {
         result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
         imgproc::matchTemplateNaive_8U_SQDIFF(image, templ, result);
-
-        //GpuMat image_sum;
-        //GpuMat image_sumsq;
-        //integral(image, image_sum, image_sumsq);
-
-        //float templ_sumsq = 0.f;
-
-        //matchTemplate_8U_CCORR(image, templ, result);
-
-        //imgproc::matchTemplatePrepared_8U_SQDIFF(
-        //    templ.cols, templ.rows, image_sumsq, templ_sumsq, result);
     }
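
For readers unfamiliar with the single-kernel reduction that sum_kernel uses on compute capability >= 1.1 devices, the sketch below shows the same pattern in a standalone form: every block stores its partial sum, issues a __threadfence(), and takes a ticket with atomicInc; the block that draws the last ticket folds all partials into the final result and resets the counter. This is also why the patch clamps grid.x and grid.y to the block dimensions, so that one block can fold every partial. The names below (blocks_done, block_reduce, sum_single_pass) are illustrative only and are not part of the patch or of OpenCV.

// Standalone sketch of the "last block folds the partials" pattern used by
// sum_kernel above. Requires global atomics (compute capability >= 1.1).
// All names are illustrative and not taken from the OpenCV patch.
#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned int blocks_done = 0;     // ticket counter, reset after each launch

template <int NTHREADS>
__device__ void block_reduce(volatile int* smem, int tid)
{
    // Tree reduction in shared memory; NTHREADS must be a power of two.
    for (int s = NTHREADS / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }
}

template <int NTHREADS>
__global__ void sum_single_pass(const int* data, int n, int* partials)
{
    __shared__ int smem[NTHREADS];
    __shared__ bool is_last;

    const int tid = threadIdx.x;

    // Grid-stride accumulation into a per-thread register.
    int acc = 0;
    for (int i = blockIdx.x * NTHREADS + tid; i < n; i += gridDim.x * NTHREADS)
        acc += data[i];

    smem[tid] = acc;
    __syncthreads();
    block_reduce<NTHREADS>(smem, tid);

    if (tid == 0)
    {
        partials[blockIdx.x] = smem[0];          // publish this block's partial
        __threadfence();                         // make it visible to other blocks
        unsigned int ticket = atomicInc(&blocks_done, gridDim.x);
        is_last = (ticket == gridDim.x - 1);     // did this block finish last?
    }
    __syncthreads();

    // Only the block that finished last folds the partial sums. This needs
    // gridDim.x <= NTHREADS, mirroring the grid clamp in the patch.
    if (is_last)
    {
        smem[tid] = tid < gridDim.x ? partials[tid] : 0;
        __syncthreads();
        block_reduce<NTHREADS>(smem, tid);

        if (tid == 0)
        {
            partials[0] = smem[0];               // final result
            blocks_done = 0;                     // reset for the next launch
        }
    }
}

int main()
{
    const int n = 1 << 20;
    const int nthreads = 256;
    const int nblocks = 64;                      // must stay <= nthreads

    int *d_data, *d_partials;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMalloc(&d_partials, nblocks * sizeof(int));
    cudaMemset(d_data, 0, n * sizeof(int));      // all zeros, so the sum is 0

    sum_single_pass<nthreads><<<nblocks, nthreads>>>(d_data, n, d_partials);

    int sum = 0;
    cudaMemcpy(&sum, d_partials, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sum = %d\n", sum);

    cudaFree(d_data);
    cudaFree(d_partials);
    return 0;
}

On hardware below compute capability 1.1, where global atomicInc is unavailable, the patch takes the sum_multipass_caller route instead: sum_kernel only writes the per-block partials, and a second launch of sum_pass2_kernel in a single block performs the same fold that the is_last branch does above.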