From 90ae1e3aeddb0f5a026362d878d223f34b3804e5 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Wed, 19 Jan 2011 10:54:58 +0000
Subject: [PATCH] refactored gpu module

---
 doc/gpu_initialization.tex                |  35 +-
 modules/gpu/include/opencv2/gpu/gpu.hpp   |   5 +-
 modules/gpu/src/cuda/imgproc.cu           |   4 +-
 modules/gpu/src/cuda/matrix_reductions.cu | 501 +++++++++++-----------
 modules/gpu/src/initialization.cpp        |  96 ++---
 modules/gpu/src/matrix_reductions.cpp     | 241 ++++++-----
 6 files changed, 471 insertions(+), 411 deletions(-)

diff --git a/doc/gpu_initialization.tex b/doc/gpu_initialization.tex
index f4f8cac4a6..930ea167f1 100644
--- a/doc/gpu_initialization.tex
+++ b/doc/gpu_initialization.tex
@@ -69,22 +69,33 @@
 Returns true, if the specified GPU has atomics support, otherwise false.
 \end{description}
 
 
-\cvCppFunc{gpu::checkPtxVersion}
+\cvCppFunc{gpu::ptxVersionIs}
 Returns true, if the GPU module was built with PTX support of the given compute capability, otherwise false.
 
-\cvdefCpp{template $<$unsigned int cmp\_op$>$\newline
-bool checkPtxVersion(int major, int minor);}
+\cvdefCpp{bool ptxVersionIs(int major, int minor);}
 \begin{description}
-\cvarg{cmp\_op}{Comparison operation:
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+
+\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or less, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
 \begin{description}
-\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false}
-\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false}
-\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false}
-\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
-\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
-\end{description}}
-\cvarg{major}{Major CC version.}
-\cvarg{minor}{Minor CC version.}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+
+\cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or greater, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
 \end{description}
 
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 61f5d6608f..d754a98432 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -72,8 +72,9 @@ namespace cv
         CV_EXPORTS bool hasNativeDoubleSupport(int device);
         CV_EXPORTS bool hasAtomicsSupport(int device);
 
-        template <unsigned int cmp_op>
-        CV_EXPORTS bool checkPtxVersion(int major, int minor);
+        CV_EXPORTS bool ptxVersionIs(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);
 
        //!
Checks if the GPU module is PTX compatible with the given NVIDIA device CV_EXPORTS bool isCompatibleWith(int device); diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 7d3ff893b4..c48ee526de 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc ////////////////////////////// Column Sum ////////////////////////////////////// - __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst) + __global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst) { int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc dim3 threads(256); dim3 grid(divUp(src.cols, threads.x)); - column_sum_kernel_32F<<>>(src.cols, src.rows, src, dst); + column_sumKernel_32F<<>>(src.cols, src.rows, src, dst); cudaSafeCall(cudaThreadSynchronize()); } diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index b16b4956b1..0e45fa46c2 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace mathfunc // Performs reduction in shared memory template - __device__ void sum_in_smem(volatile T* data, const uint tid) + __device__ void sumInSmem(volatile T* data, const uint tid) { T sum = data[tid]; @@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace mathfunc // Estimates good thread configuration // - threads variable satisfies to threads.x * threads.y == 256 - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(32, 8); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); @@ -132,17 +132,17 @@ namespace cv { namespace gpu { namespace mathfunc // Returns required buffer sizes - void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows) + void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows) { dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); + estimateThreadCfg(cols, rows, threads, grid); bufcols = grid.x * grid.y * elem_size; bufrows = 2; } // Estimates device constants which are used in the kernels using specified thread configuration - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); @@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid) + __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid) { if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); } if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); } @@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval) + __global__ void minMaxKernel(const DevMem2D src, Mask mask, T* minval, T* maxval) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; @@ -212,7 +212,7 @@ namespace cv { 
namespace gpu { namespace mathfunc smaxval[tid] = mymax; __syncthreads(); - find_min_max_in_smem(sminval, smaxval, tid); + findMinMaxInSmem(sminval, smaxval, tid); if (tid == 0) { @@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace mathfunc smaxval[tid] = maxval[idx]; __syncthreads(); - find_min_max_in_smem(sminval, smaxval, tid); + findMinMaxInSmem(sminval, smaxval, tid); if (tid == 0) { @@ -263,16 +263,16 @@ namespace cv { namespace gpu { namespace mathfunc template - void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) + void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)buf.ptr(0); T* maxval_buf = (T*)buf.ptr(1); - min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); + minMaxKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -282,26 +282,26 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) + void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)buf.ptr(0); T* maxval_buf = (T*)buf.ptr(1); - min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); + minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -311,17 +311,17 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const 
DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*,double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*,double*, PtrStep); + template void minMaxCaller(const DevMem2D, double*, double*, PtrStep); template - __global__ void min_max_pass2_kernel(T* minval, T* maxval, int size) + __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; @@ -334,7 +334,7 @@ namespace cv { namespace gpu { namespace mathfunc smaxval[tid] = maxval[idx]; __syncthreads(); - find_min_max_in_smem(sminval, smaxval, tid); + findMinMaxInSmem(sminval, smaxval, tid); if (tid == 0) { @@ -345,17 +345,17 @@ namespace cv { namespace gpu { namespace mathfunc template - void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) + void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)buf.ptr(0); T* maxval_buf = (T*)buf.ptr(1); - min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); - min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); + minMaxKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); + minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -365,26 +365,26 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void 
minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void minMaxMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template - void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) + void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)buf.ptr(0); T* maxval_buf = (T*)buf.ptr(1); - min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); - min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); + minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); + minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -394,12 +394,12 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); + template void minMaxMultipassCaller(const DevMem2D, double*, double*, PtrStep); } // namespace minmax @@ -417,7 +417,7 @@ namespace cv { namespace gpu { namespace mathfunc // Estimates good thread configuration // - threads variable satisfies to threads.x * threads.y == 256 - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(32, 8); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); @@ -427,11 +427,11 @@ namespace cv { namespace gpu { namespace mathfunc // Returns required buffer sizes - void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, + void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows) { dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); + estimateThreadCfg(cols, rows, threads, grid); b1cols = grid.x * grid.y * elem_size; // For values b1rows = 2; b2cols = grid.x * grid.y * sizeof(int); // For locations @@ -440,7 +440,7 @@ namespace cv { namespace gpu { namespace mathfunc // Estimates device constants which are used in the kernels using specified thread configuration - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = 
divUp(divUp(rows, grid.y), threads.y); @@ -469,8 +469,8 @@ namespace cv { namespace gpu { namespace mathfunc template - __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc, - volatile uint* maxloc, const uint tid) + __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, + volatile uint* maxloc, const uint tid) { if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } @@ -489,8 +489,8 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval, - uint* minloc, uint* maxloc) + __global__ void minMaxLocKernel(const DevMem2D src, Mask mask, T* minval, T* maxval, + uint* minloc, uint* maxloc) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; @@ -503,7 +503,8 @@ namespace cv { namespace gpu { namespace mathfunc uint tid = threadIdx.y * blockDim.x + threadIdx.x; T mymin = numeric_limits_gpu::max(); - T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); + T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : + numeric_limits_gpu::min(); uint myminloc = 0; uint mymaxloc = 0; uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); @@ -529,7 +530,7 @@ namespace cv { namespace gpu { namespace mathfunc smaxloc[tid] = mymaxloc; __syncthreads(); - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -558,7 +559,7 @@ namespace cv { namespace gpu { namespace mathfunc smaxloc[tid] = maxloc[idx]; __syncthreads(); - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); if (tid == 0) { @@ -582,19 +583,20 @@ namespace cv { namespace gpu { namespace mathfunc template - void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); - min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, + minloc_buf, maxloc_buf); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -610,29 +612,30 @@ namespace cv { namespace gpu { namespace mathfunc maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], 
PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template - void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, + void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); - min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, + minloc_buf, maxloc_buf); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -648,18 +651,18 @@ namespace cv { namespace gpu { namespace mathfunc maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, 
PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); // This kernel will be used only when compute capability is 1.0 template - __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) + __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; @@ -676,7 +679,7 @@ namespace cv { namespace gpu { namespace mathfunc smaxloc[tid] = maxloc[idx]; __syncthreads(); - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); if (tid == 0) { @@ -689,20 +692,21 @@ namespace cv { namespace gpu { namespace mathfunc template - void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); - min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); + minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, + minloc_buf, maxloc_buf); + minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -718,29 +722,30 @@ namespace cv { namespace gpu { namespace mathfunc maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, 
int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMaskMultipassCaller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template - void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); - min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); + minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, + minloc_buf, maxloc_buf); + minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -756,12 +761,12 @@ namespace cv { namespace gpu { namespace mathfunc maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void minMaxLocMultipassCaller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); } // namespace minmaxloc @@ -776,7 +781,7 @@ namespace cv { namespace gpu { namespace mathfunc __device__ uint blocks_finished = 0; - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(32, 8); 
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); @@ -785,16 +790,16 @@ namespace cv { namespace gpu { namespace mathfunc } - void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows) + void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows) { dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); + estimateThreadCfg(cols, rows, threads, grid); bufcols = grid.x * grid.y * sizeof(int); bufrows = 1; } - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); @@ -804,7 +809,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count) + __global__ void countNonZeroKernel(const DevMem2D src, volatile uint* count) { __shared__ uint scount[nthreads]; @@ -823,7 +828,7 @@ namespace cv { namespace gpu { namespace mathfunc scount[tid] = cnt; __syncthreads(); - sum_in_smem(scount, tid); + sumInSmem(scount, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -844,7 +849,7 @@ namespace cv { namespace gpu { namespace mathfunc scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0; __syncthreads(); - sum_in_smem(scount, tid); + sumInSmem(scount, tid); if (tid == 0) { @@ -859,15 +864,15 @@ namespace cv { namespace gpu { namespace mathfunc template - int count_non_zero_caller(const DevMem2D src, PtrStep buf) + int countNonZeroCaller(const DevMem2D src, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); uint* count_buf = (uint*)buf.ptr(0); - count_non_zero_kernel<256, T><<>>(src, count_buf); + countNonZeroKernel<256, T><<>>(src, count_buf); cudaSafeCall(cudaThreadSynchronize()); uint count; @@ -876,17 +881,17 @@ namespace cv { namespace gpu { namespace mathfunc return count; } - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); + template int countNonZeroCaller(const DevMem2D, PtrStep); template - __global__ void count_non_zero_pass2_kernel(uint* count, int size) + __global__ void countNonZeroPass2Kernel(uint* count, int size) { __shared__ uint scount[nthreads]; uint tid = threadIdx.y * blockDim.x + threadIdx.x; @@ -894,7 +899,7 @@ namespace cv { namespace gpu { namespace mathfunc scount[tid] = tid < size ? 
count[tid] : 0; __syncthreads(); - sum_in_smem(scount, tid); + sumInSmem(scount, tid); if (tid == 0) count[0] = scount[0]; @@ -902,16 +907,16 @@ namespace cv { namespace gpu { namespace mathfunc template - int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf) + int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf) { dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); uint* count_buf = (uint*)buf.ptr(0); - count_non_zero_kernel<256, T><<>>(src, count_buf); - count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); + countNonZeroKernel<256, T><<>>(src, count_buf); + countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); uint count; @@ -920,12 +925,12 @@ namespace cv { namespace gpu { namespace mathfunc return count; } - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); + template int countNonZeroMultipassCaller(const DevMem2D, PtrStep); } // namespace countnonzero @@ -958,7 +963,7 @@ namespace cv { namespace gpu { namespace mathfunc const int threads_x = 32; const int threads_y = 8; - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(threads_x, threads_y); grid = dim3(divUp(cols, threads.x * threads.y), @@ -968,16 +973,16 @@ namespace cv { namespace gpu { namespace mathfunc } - void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows) + void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows) { dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); + estimateThreadCfg(cols, rows, threads, grid); bufcols = grid.x * grid.y * sizeof(double) * cn; bufrows = 1; } - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); @@ -986,7 +991,7 @@ namespace cv { namespace gpu { namespace mathfunc } template - __global__ void sum_kernel(const DevMem2D src, R* result) + __global__ void sumKernel(const DevMem2D src, R* result) { __shared__ R smem[nthreads]; @@ -1006,7 +1011,7 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid] = sum; __syncthreads(); - sum_in_smem(smem, tid); + sumInSmem(smem, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -1027,7 +1032,7 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid] = tid < gridDim.x * gridDim.y ? 
result[tid] : 0; __syncthreads(); - sum_in_smem(smem, tid); + sumInSmem(smem, tid); if (tid == 0) { @@ -1042,7 +1047,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_pass2_kernel(R* result, int size) + __global__ void sumPass2Kernel(R* result, int size) { __shared__ R smem[nthreads]; int tid = threadIdx.y * blockDim.x + threadIdx.x; @@ -1050,7 +1055,7 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid] = tid < size ? result[tid] : 0; __syncthreads(); - sum_in_smem(smem, tid); + sumInSmem(smem, tid); if (tid == 0) result[0] = smem[0]; @@ -1058,7 +1063,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec::vec_t* result) + __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec::vec_t* result) { typedef typename TypeVec::vec_t SrcType; typedef typename TypeVec::vec_t DstType; @@ -1086,8 +1091,8 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + nthreads] = sum.y; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -1113,8 +1118,8 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + nthreads] = res.y; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); if (tid == 0) { @@ -1137,7 +1142,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_pass2_kernel_C2(typename TypeVec::vec_t* result, int size) + __global__ void sumPass2Kernel_C2(typename TypeVec::vec_t* result, int size) { typedef typename TypeVec::vec_t DstType; @@ -1150,8 +1155,8 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + nthreads] = res.y; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); if (tid == 0) { @@ -1163,7 +1168,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec::vec_t* result) + __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec::vec_t* result) { typedef typename TypeVec::vec_t SrcType; typedef typename TypeVec::vec_t DstType; @@ -1192,9 +1197,9 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 2 * nthreads] = sum.z; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -1222,9 +1227,9 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 2 * nthreads] = res.z; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); if (tid == 0) { @@ -1249,7 +1254,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_pass2_kernel_C3(typename TypeVec::vec_t* result, int size) + __global__ void sumPass2Kernel_C3(typename TypeVec::vec_t* result, int size) { typedef typename TypeVec::vec_t DstType; @@ -1263,9 +1268,9 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 2 * nthreads] = res.z; __syncthreads(); - sum_in_smem(smem, tid); - 
sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); if (tid == 0) { @@ -1277,7 +1282,7 @@ namespace cv { namespace gpu { namespace mathfunc } template - __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec::vec_t* result) + __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec::vec_t* result) { typedef typename TypeVec::vec_t SrcType; typedef typename TypeVec::vec_t DstType; @@ -1308,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 3 * nthreads] = sum.w; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); + sumInSmem(smem + 3 * nthreads, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -1341,10 +1346,10 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 3 * nthreads] = res.w; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); + sumInSmem(smem + 3 * nthreads, tid); if (tid == 0) { @@ -1371,7 +1376,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void sum_pass2_kernel_C4(typename TypeVec::vec_t* result, int size) + __global__ void sumPass2Kernel_C4(typename TypeVec::vec_t* result, int size) { typedef typename TypeVec::vec_t DstType; @@ -1386,10 +1391,10 @@ namespace cv { namespace gpu { namespace mathfunc smem[tid + 3 * nthreads] = res.z; __syncthreads(); - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); + sumInSmem(smem, tid); + sumInSmem(smem + nthreads, tid); + sumInSmem(smem + 2 * nthreads, tid); + sumInSmem(smem + 3 * nthreads, tid); if (tid == 0) { @@ -1405,36 +1410,36 @@ namespace cv { namespace gpu { namespace mathfunc template - void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef typename SumType::R R; dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: - sum_kernel, threads_x * threads_y><<>>( + sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel<<<1, threads_x * threads_y>>>( + sumPass2Kernel<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); case 2: - sum_kernel_C2, threads_x * threads_y><<>>( + sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); case 3: - sum_kernel_C3, threads_x * threads_y><<>>( + sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( (typename 
TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); case 4: - sum_kernel_C4, threads_x * threads_y><<>>( + sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); } cudaSafeCall(cudaThreadSynchronize()); @@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sumMultipassCaller(const DevMem2D, PtrStep, double*, int); template - void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef typename SumType::R R; dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: - sum_kernel, threads_x * threads_y><<>>( + sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 2: - sum_kernel_C2, threads_x * threads_y><<>>( + sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 3: - sum_kernel_C3, threads_x * threads_y><<>>( + sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 4: - sum_kernel_C4, threads_x * threads_y><<>>( + sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; } @@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); + template void sumCaller(const DevMem2D, PtrStep, double*, int); template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef 
typename SumType::R R; dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: - sum_kernel, threads_x * threads_y><<>>( + sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel<<<1, threads_x * threads_y>>>( + sumPass2Kernel<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); break; case 2: - sum_kernel_C2, threads_x * threads_y><<>>( + sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); break; case 3: - sum_kernel_C3, threads_x * threads_y><<>>( + sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); break; case 4: - sum_kernel_C4, threads_x * threads_y><<>>( + sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); break; } @@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumMultipassCaller(const DevMem2D, PtrStep, double*, int); template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef typename SumType::R R; dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); + estimateThreadCfg(src.cols, src.rows, threads, grid); + setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: - sum_kernel, threads_x * threads_y><<>>( + sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 2: - sum_kernel_C2, threads_x * threads_y><<>>( + sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 3: - sum_kernel_C3, threads_x * threads_y><<>>( + sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; case 4: - sum_kernel_C4, threads_x * threads_y><<>>( + sumKernel_C4, threads_x * threads_y><<>>( src, 
(typename TypeVec::vec_t*)buf.ptr(0)); break; } @@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); + template void sqrSumCaller(const DevMem2D, PtrStep, double*, int); }}} \ No newline at end of file diff --git a/modules/gpu/src/initialization.cpp b/modules/gpu/src/initialization.cpp index a21fef4fbd..bb613319f4 100644 --- a/modules/gpu/src/initialization.cpp +++ b/modules/gpu/src/initialization.cpp @@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device) namespace { - template - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2); - - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) + struct ComparerEqual { - return lhs1 == rhs1 && lhs2 == rhs2; - } + bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const + { + return lhs1 == rhs1 && lhs2 == rhs2; + } + }; - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) - { - return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2); - } - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) + struct ComparerLessOrEqual { - return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2); - } + bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const + { + return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); + } + }; - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) - { - return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2); - } - - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) - { - return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); - } - - template <> - bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2) + struct ComparerGreaterOrEqual { - return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); - } -} + bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const + { + return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2); + } + }; -template -CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor) -{ + template + bool checkPtxVersion(int major, int minor, Comparer cmp) + { #ifdef OPENCV_GPU_CUDA_ARCH_10 - if (comparePairs(1, 0, major, minor)) return true; + if (cmp(1, 0, major, minor)) return true; #endif #ifdef OPENCV_GPU_CUDA_ARCH_11 - if (comparePairs(1, 1, major, minor)) return true; + if (cmp(1, 1, major, minor)) return true; #endif #ifdef OPENCV_GPU_CUDA_ARCH_12 - if (comparePairs(1, 2, major, minor)) return true; + if (cmp(1, 2, major, minor)) return true; #endif #ifdef OPENCV_GPU_CUDA_ARCH_13 - if (comparePairs(1, 3, major, minor)) return true; + if (cmp(1, 3, major, minor)) return true; #endif #ifdef OPENCV_GPU_CUDA_ARCH_20 - if (comparePairs(2, 0, major, minor)) return true; + if (cmp(2, 0, major, minor)) return true; #endif #ifdef OPENCV_GPU_CUDA_ARCH_21 - if (comparePairs(2, 1, major, minor)) return 
true; + if (cmp(2, 1, major, minor)) return true; #endif - return false; + return false; + } +} + + +CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor) +{ + return checkPtxVersion(major, minor, ComparerEqual()); } -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); -template CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor); +CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor) +{ + return checkPtxVersion(major, minor, ComparerLessOrEqual()); +} + + +CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor) +{ + return checkPtxVersion(major, minor, ComparerGreaterOrEqual()); +} CV_EXPORTS bool isCompatibleWith(int device) @@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device) int major, minor; getComputeCapability(device, major, minor); - return checkPtxVersion(major, minor); + return ptxVersionIsLessOrEqual(major, minor); } #endif diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index d3b1534902..dd1d152cd3 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType) namespace cv { namespace gpu { namespace mathfunc { template - void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn); template - void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn); template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn); template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn); namespace sum { - void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows); + void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows); } }}} @@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) using namespace mathfunc; typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); - static const Caller callers[2][7] = - { { sum_multipass_caller, sum_multipass_caller, - sum_multipass_caller, sum_multipass_caller, - sum_multipass_caller, sum_multipass_caller, 0 }, - { sum_caller, sum_caller, - sum_caller, sum_caller, - sum_caller, sum_caller, 0 } }; - - Size bufSize; - sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); - ensureSizeIsEnough(bufSize, CV_8U, buf); - - Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; + + static Caller multipass_callers[7] = { + sumMultipassCaller, sumMultipassCaller, + sumMultipassCaller, sumMultipassCaller, + sumMultipassCaller, sumMultipassCaller, 0 }; + + static Caller singlepass_callers[7] = { + sumCaller, sumCaller, + sumCaller, sumCaller, + sumCaller, sumCaller, 0 }; + + Size buf_size; + sum::getBufSizeRequired(src.cols, src.rows, src.channels(), + buf_size.width, buf_size.height); + 
ensureSizeIsEnough(buf_size, CV_8U, buf); + + Caller* callers = multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = singlepass_callers; + + Caller caller = callers[src.depth()]; if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type"); double result[4]; @@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) using namespace mathfunc; typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); - static const Caller callers[2][7] = - { { sqsum_multipass_caller, sqsum_multipass_caller, - sqsum_multipass_caller, sqsum_multipass_caller, - sqsum_multipass_caller, sqsum_multipass_caller, 0 }, - { sqsum_caller, sqsum_caller, - sqsum_caller, sqsum_caller, - sqsum_caller, sqsum_caller, 0 } }; - - Size bufSize; - sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); - ensureSizeIsEnough(bufSize, CV_8U, buf); - - Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; + + static Caller multipass_callers[7] = { + sqrSumMultipassCaller, sqrSumMultipassCaller, + sqrSumMultipassCaller, sqrSumMultipassCaller, + sqrSumMultipassCaller, sqrSumMultipassCaller, 0 }; + + static Caller singlepass_callers[7] = { + sqrSumCaller, sqrSumCaller, + sqrSumCaller, sqrSumCaller, + sqrSumCaller, sqrSumCaller, 0 }; + + Caller* callers = multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = singlepass_callers; + + Size buf_size; + sum::getBufSizeRequired(src.cols, src.rows, src.channels(), + buf_size.width, buf_size.height); + ensureSizeIsEnough(buf_size, CV_8U, buf); + + Caller caller = callers[src.depth()]; if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); double result[4]; @@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) namespace cv { namespace gpu { namespace mathfunc { namespace minmax { - void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows); + void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows); template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); + void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); template - void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); + void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); template - void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); + void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); template - void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); + void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); }}}} @@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); - static const Caller callers[2][7] = - { { min_max_multipass_caller, min_max_multipass_caller, - min_max_multipass_caller, min_max_multipass_caller, - min_max_multipass_caller, min_max_multipass_caller, 0 }, - { min_max_caller, min_max_caller, - min_max_caller, min_max_caller, - min_max_caller, min_max_caller, 
min_max_caller } }; + static Caller multipass_callers[7] = { + minMaxMultipassCaller, minMaxMultipassCaller, + minMaxMultipassCaller, minMaxMultipassCaller, + minMaxMultipassCaller, minMaxMultipassCaller, 0 }; - static const MaskedCaller masked_callers[2][7] = - { { min_max_mask_multipass_caller, min_max_mask_multipass_caller, - min_max_mask_multipass_caller, min_max_mask_multipass_caller, - min_max_mask_multipass_caller, min_max_mask_multipass_caller, 0 }, - { min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller } }; + static Caller singlepass_callers[7] = { + minMaxCaller, minMaxCaller, + minMaxCaller, minMaxCaller, + minMaxCaller, minMaxCaller, minMaxCaller }; + static MaskedCaller masked_multipass_callers[7] = { + minMaxMaskMultipassCaller, minMaxMaskMultipassCaller, + minMaxMaskMultipassCaller, minMaxMaskMultipassCaller, + minMaxMaskMultipassCaller, minMaxMaskMultipassCaller, 0 }; + + static MaskedCaller masked_singlepass_callers[7] = { + minMaxMaskCaller, minMaxMaskCaller, + minMaxMaskCaller, minMaxMaskCaller, + minMaxMaskCaller, minMaxMaskCaller, + minMaxMaskCaller }; CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); @@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp double minVal_; if (!minVal) minVal = &minVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_; - Size bufSize; - get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height); - ensureSizeIsEnough(bufSize, CV_8U, buf); + Size buf_size; + getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height); + ensureSizeIsEnough(buf_size, CV_8U, buf); if (mask.empty()) { - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + Caller* callers = multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = singlepass_callers; + + Caller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); caller(src, minVal, maxVal, buf); } else { - MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; + MaskedCaller* callers = masked_multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = masked_singlepass_callers; + + MaskedCaller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); caller(src, mask, minVal, maxVal, buf); } @@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { - void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, + void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows); template - void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, + void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); template - void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); template - void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* 
maxval, + void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); template - void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); }}}} @@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - static const Caller callers[2][7] = - { { min_max_loc_multipass_caller, min_max_loc_multipass_caller, - min_max_loc_multipass_caller, min_max_loc_multipass_caller, - min_max_loc_multipass_caller, min_max_loc_multipass_caller, 0 }, - { min_max_loc_caller, min_max_loc_caller, - min_max_loc_caller, min_max_loc_caller, - min_max_loc_caller, min_max_loc_caller, min_max_loc_caller } }; - - static const MaskedCaller masked_callers[2][7] = - { { min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, - min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, - min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, 0 }, - { min_max_loc_mask_caller, min_max_loc_mask_caller, - min_max_loc_mask_caller, min_max_loc_mask_caller, - min_max_loc_mask_caller, min_max_loc_mask_caller, min_max_loc_mask_caller } }; + static Caller multipass_callers[7] = { + minMaxLocMultipassCaller, minMaxLocMultipassCaller, + minMaxLocMultipassCaller, minMaxLocMultipassCaller, + minMaxLocMultipassCaller, minMaxLocMultipassCaller, 0 }; + + static Caller singlepass_callers[7] = { + minMaxLocCaller, minMaxLocCaller, + minMaxLocCaller, minMaxLocCaller, + minMaxLocCaller, minMaxLocCaller, minMaxLocCaller }; + + static MaskedCaller masked_multipass_callers[7] = { + minMaxLocMaskMultipassCaller, minMaxLocMaskMultipassCaller, + minMaxLocMaskMultipassCaller, minMaxLocMaskMultipassCaller, + minMaxLocMaskMultipassCaller, minMaxLocMaskMultipassCaller, 0 }; + + static MaskedCaller masked_singlepass_callers[7] = { + minMaxLocMaskCaller, minMaxLocMaskCaller, + minMaxLocMaskCaller, minMaxLocMaskCaller, + minMaxLocMaskCaller, minMaxLocMaskCaller, + minMaxLocMaskCaller }; CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); @@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point int minLoc_[2]; int maxLoc_[2]; - Size valBufSize, locBufSize; - get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width, - valBufSize.height, locBufSize.width, locBufSize.height); - ensureSizeIsEnough(valBufSize, CV_8U, valBuf); - ensureSizeIsEnough(locBufSize, CV_8U, locBuf); + Size valbuf_size, locbuf_size; + getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width, + valbuf_size.height, locbuf_size.width, locbuf_size.height); + ensureSizeIsEnough(valbuf_size, CV_8U, valBuf); + ensureSizeIsEnough(locbuf_size, CV_8U, locBuf); if (mask.empty()) { - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + Caller* callers = multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = singlepass_callers; + + Caller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, 
"minMaxLoc: unsupported type"); caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } else { - MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; + MaskedCaller* callers = masked_multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = masked_singlepass_callers; + + MaskedCaller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } @@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { - void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows); + void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows); template - int count_non_zero_caller(const DevMem2D src, PtrStep buf); + int countNonZeroCaller(const DevMem2D src, PtrStep buf); template - int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf); + int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf); }}}} @@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf) typedef int (*Caller)(const DevMem2D src, PtrStep buf); - static const Caller callers[2][7] = - { { count_non_zero_multipass_caller, count_non_zero_multipass_caller, - count_non_zero_multipass_caller, count_non_zero_multipass_caller, - count_non_zero_multipass_caller, count_non_zero_multipass_caller, 0}, - { count_non_zero_caller, count_non_zero_caller, - count_non_zero_caller, count_non_zero_caller, - count_non_zero_caller, count_non_zero_caller, count_non_zero_caller } }; + static Caller multipass_callers[7] = { + countNonZeroMultipassCaller, countNonZeroMultipassCaller, + countNonZeroMultipassCaller, countNonZeroMultipassCaller, + countNonZeroMultipassCaller, countNonZeroMultipassCaller, 0 }; + + static Caller singlepass_callers[7] = { + countNonZeroCaller, countNonZeroCaller, + countNonZeroCaller, countNonZeroCaller, + countNonZeroCaller, countNonZeroCaller, + countNonZeroCaller }; CV_Assert(src.channels() == 1); CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); - Size bufSize; - get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); - ensureSizeIsEnough(bufSize, CV_8U, buf); + Size buf_size; + getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height); + ensureSizeIsEnough(buf_size, CV_8U, buf); + + Caller* callers = multipass_callers; + if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice())) + callers = singlepass_callers; - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + Caller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type"); return caller(src, buf); }