From 9f80317ffa976f7795a2ad39919a9682622bec9d Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Fri, 10 Dec 2010 13:36:00 +0000
Subject: [PATCH] fixed minor bug in gpu module, added first version of sum

---
 modules/gpu/src/cuda/imgproc.cu    |   4 +-
 modules/gpu/src/cuda/mathfunc.cu   | 184 +++++++++++++++++++++++++++--
 modules/gpu/src/match_template.cpp |  11 --
 3 files changed, 179 insertions(+), 20 deletions(-)

diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 143e8d09f1..4ba2a89797 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc

     ////////////////////////////// Column Sum //////////////////////////////////////

-    __global__ void columnSumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;

@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(256);
         dim3 grid(divUp(src.cols, threads.x));

-        columnSumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall(cudaThreadSynchronize());
     }

diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index dde2b048e7..3d9ceda5bd 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -450,6 +450,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -662,7 +664,6 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             minval[0] = (T)sminval[0];
             maxval[0] = (T)smaxval[0];
-            blocks_finished = 0;
         }
     }

@@ -744,6 +745,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -1005,7 +1008,6 @@ namespace cv { namespace gpu { namespace mathfunc
             maxval[0] = (T)smaxval[0];
             minloc[0] = sminloc[0];
             maxloc[0] = smaxloc[0];
-            blocks_finished = 0;
         }
     }

@@ -1102,6 +1104,8 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
     }


@@ -1212,13 +1216,12 @@ namespace cv { namespace gpu { namespace mathfunc

         unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
         scount[tid] = tid < size ? count[tid] : 0;
-        sum_in_smem<nthreads, unsigned int>(scount, tid);
+        __syncthreads();

-        if (tid == 0)
-        {
+        sum_in_smem<nthreads, unsigned int>(scount, tid);
+
+        if (tid == 0)
             count[0] = scount[0];
-            blocks_finished = 0;
-        }
     }


@@ -1409,4 +1412,171 @@ namespace cv { namespace gpu { namespace mathfunc
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
 template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream);
+
+//////////////////////////////////////////////////////////////////////////////
+// Sum
+
+    namespace sum
+    {
+
+    __constant__ int ctwidth;
+    __constant__ int ctheight;
+    __device__ unsigned int blocks_finished = 0;
+
+    const int threads_x = 32;
+    const int threads_y = 8;
+
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+    {
+        threads = dim3(threads_x, threads_y);
+        grid = dim3(divUp(cols, threads.x * threads.y),
+                    divUp(rows, threads.y * threads.x));
+        grid.x = min(grid.x, threads.x);
+        grid.y = min(grid.y, threads.y);
+    }
+
+
+    template <typename T>
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * sizeof(T);
+        bufrows = 1;
+    }
+
+
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    {
+        int twidth = divUp(divUp(cols, grid.x), threads.x);
+        int theight = divUp(divUp(rows, grid.y), threads.y);
+        cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
+        cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
+    }
+
+    template <typename T, int nthreads>
+    __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
+    {
+        __shared__ T smem[nthreads];
+
+        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+
+        T sum = 0;
+        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+        {
+            const T* ptr = src.ptr(y0 + y * blockDim.y);
+            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+                sum += ptr[x0 + x * blockDim.x];
+        }
+
+        smem[tid] = sum;
+        __syncthreads();
+
+        sum_in_smem<nthreads, T>(smem, tid);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+        __shared__ bool is_last;
+
+        if (tid == 0)
+        {
+            result[bid] = smem[0];
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = (ticket == gridDim.x * gridDim.y - 1);
+        }
+
+        __syncthreads();
+
+        if (is_last)
+        {
+            smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
+            __syncthreads();
+
+            sum_in_smem<nthreads, T>(smem, tid);
+
+            if (tid == 0)
+            {
+                result[0] = smem[0];
+                blocks_finished = 0;
+            }
+        }
+#else
+        if (tid == 0) result[bid] = smem[0];
+#endif
+    }
+
+
+    template <typename T>
+    T sum_caller(const DevMem2D_<T> src, PtrStep buf)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        T* buf_ = (T*)buf.ptr(0);
+
+        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        T sum;
+        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
+
+        return sum;
+    }
+
+    template unsigned char sum_caller(const DevMem2D_<unsigned char>, PtrStep);
+    template char sum_caller(const DevMem2D_<char>, PtrStep);
+    template unsigned short sum_caller(const DevMem2D_<unsigned short>, PtrStep);
+    template short sum_caller(const DevMem2D_<short>, PtrStep);
+    template int sum_caller(const DevMem2D_<int>, PtrStep);
+    template float sum_caller(const DevMem2D_<float>, PtrStep);
+    template double sum_caller(const DevMem2D_<double>, PtrStep);
+
+
+    template <typename T, int nthreads>
+    __global__ void sum_pass2_kernel(T* result, int size)
+    {
+        __shared__ T smem[nthreads];
+        int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        smem[tid] = tid < size ? result[tid] : 0;
+        sum_in_smem<nthreads, T>(smem, tid);
+
+        if (tid == 0)
+            result[0] = smem[0];
+    }
+
+
+    template <typename T>
+    T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        T* buf_ = (T*)buf.ptr(0);
+
+        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+            buf_, grid.x * grid.y);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        T sum;
+        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
+
+        return sum;
+    }
+
+    template unsigned char sum_multipass_caller(const DevMem2D_<unsigned char>, PtrStep);
+    template char sum_multipass_caller(const DevMem2D_<char>, PtrStep);
+    template unsigned short sum_multipass_caller(const DevMem2D_<unsigned short>, PtrStep);
+    template short sum_multipass_caller(const DevMem2D_<short>, PtrStep);
+    template int sum_multipass_caller(const DevMem2D_<int>, PtrStep);
+    template float sum_multipass_caller(const DevMem2D_<float>, PtrStep);
+
+    } // namespace sum
 }}}
diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp
index 7b85ac41f6..6972f8168b 100644
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -244,17 +244,6 @@ namespace
     {
         result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
         imgproc::matchTemplateNaive_8U_SQDIFF(image, templ, result);
-
-        //GpuMat image_sum;
-        //GpuMat image_sumsq;
-        //integral(image, image_sum, image_sumsq);
-
-        //float templ_sumsq = 0.f;
-
-        //matchTemplate_8U_CCORR(image, templ, result);
-
-        //imgproc::matchTemplatePrepared_8U_SQDIFF(
-        //    templ.cols, templ.rows, image_sumsq, templ_sumsq, result);
     }
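
For readers unfamiliar with the single-kernel reduction that sum_kernel uses on compute capability >= 1.1 devices, the sketch below shows the same pattern in a standalone form: every block stores its partial sum, issues a __threadfence(), and takes a ticket with atomicInc; the block that draws the last ticket folds all partials into the final result and resets the counter. This is also why the patch clamps grid.x and grid.y to the block dimensions, so that one block can fold every partial. The names below (blocks_done, block_reduce, sum_single_pass) are illustrative only and are not part of the patch or of OpenCV.

// Standalone sketch of the "last block folds the partials" pattern used by
// sum_kernel above. Requires global atomics (compute capability >= 1.1).
// All names are illustrative and not taken from the OpenCV patch.
#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned int blocks_done = 0;     // ticket counter, reset after each launch

template <int NTHREADS>
__device__ void block_reduce(volatile int* smem, int tid)
{
    // Tree reduction in shared memory; NTHREADS must be a power of two.
    for (int s = NTHREADS / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }
}

template <int NTHREADS>
__global__ void sum_single_pass(const int* data, int n, int* partials)
{
    __shared__ int smem[NTHREADS];
    __shared__ bool is_last;

    const int tid = threadIdx.x;

    // Grid-stride accumulation into a per-thread register.
    int acc = 0;
    for (int i = blockIdx.x * NTHREADS + tid; i < n; i += gridDim.x * NTHREADS)
        acc += data[i];

    smem[tid] = acc;
    __syncthreads();
    block_reduce<NTHREADS>(smem, tid);

    if (tid == 0)
    {
        partials[blockIdx.x] = smem[0];          // publish this block's partial
        __threadfence();                         // make it visible to other blocks
        unsigned int ticket = atomicInc(&blocks_done, gridDim.x);
        is_last = (ticket == gridDim.x - 1);     // did this block finish last?
    }
    __syncthreads();

    // Only the block that finished last folds the partial sums. This needs
    // gridDim.x <= NTHREADS, mirroring the grid clamp in the patch.
    if (is_last)
    {
        smem[tid] = tid < gridDim.x ? partials[tid] : 0;
        __syncthreads();
        block_reduce<NTHREADS>(smem, tid);

        if (tid == 0)
        {
            partials[0] = smem[0];               // final result
            blocks_done = 0;                     // reset for the next launch
        }
    }
}

int main()
{
    const int n = 1 << 20;
    const int nthreads = 256;
    const int nblocks = 64;                      // must stay <= nthreads

    int *d_data, *d_partials;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMalloc(&d_partials, nblocks * sizeof(int));
    cudaMemset(d_data, 0, n * sizeof(int));      // all zeros, so the sum is 0

    sum_single_pass<nthreads><<<nblocks, nthreads>>>(d_data, n, d_partials);

    int sum = 0;
    cudaMemcpy(&sum, d_partials, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sum = %d\n", sum);

    cudaFree(d_data);
    cudaFree(d_partials);
    return 0;
}

On hardware below compute capability 1.1, where global atomicInc is unavailable, the patch takes the sum_multipass_caller route instead: sum_kernel only writes the per-block partials, and a second launch of sum_pass2_kernel in a single block performs the same fold that the is_last branch does above.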