diff --git a/modules/cudaarithm/src/cuda/minmax.cu b/modules/cudaarithm/src/cuda/minmax.cu index c92de44ad3..eec861bdc4 100644 --- a/modules/cudaarithm/src/cuda/minmax.cu +++ b/modules/cudaarithm/src/cuda/minmax.cu @@ -40,208 +40,77 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/vec_traits.hpp" -#include "opencv2/core/cuda/vec_math.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/reduce.hpp" -#include "opencv2/core/cuda/emulation.hpp" -#include "opencv2/core/cuda/limits.hpp" -#include "opencv2/core/cuda/utility.hpp" +#ifndef HAVE_OPENCV_CUDEV -using namespace cv::cuda; -using namespace cv::cuda::device; +#error "opencv_cudev is required" -namespace minMax -{ - __device__ unsigned int blocks_finished = 0; - - // To avoid shared bank conflicts we convert each value into value of - // appropriate type (32 bits minimum) - template struct MinMaxTypeTraits; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef float best_type; }; - template <> struct MinMaxTypeTraits { typedef double best_type; }; - - template - struct GlobalReduce - { - static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval) - { - #if __CUDA_ARCH__ >= 200 - if (tid == 0) - { - Emulation::glob::atomicMin(minval, mymin); - Emulation::glob::atomicMax(maxval, mymax); - } - #else - __shared__ bool is_last; - - if (tid == 0) - { - minval[bid] = mymin; - maxval[bid] = mymax; - - __threadfence(); - - unsigned int ticket = ::atomicAdd(&blocks_finished, 1); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - int idx = ::min(tid, gridDim.x * gridDim.y - 1); - - mymin = minval[idx]; - mymax = maxval[idx]; - - const minimum minOp; - const maximum maxOp; - device::reduce(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp)); - - if (tid == 0) - { - minval[0] = mymin; - maxval[0] = mymax; - - blocks_finished = 0; - } - } - #endif - } - }; - - template - __global__ void kernel(const PtrStepSz src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight) - { - __shared__ R sminval[BLOCK_SIZE]; - __shared__ R smaxval[BLOCK_SIZE]; - - const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - R mymin = numeric_limits::max(); - R mymax = -numeric_limits::max(); - - const minimum minOp; - const maximum maxOp; - - for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y) - { - const T* ptr = src.ptr(y); +#else - for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x) - { - if (mask(y, x)) - { - const R srcVal = ptr[x]; +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" - mymin = minOp(mymin, srcVal); - mymax = maxOp(mymax, srcVal); - } - } - } +using namespace cv::cudev; - device::reduce(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp)); - - GlobalReduce::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval); - } - - const int threads_x = 32; - const int threads_y = 8; - - void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid) +namespace +{ + template + void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal) { - block = dim3(threads_x, threads_y); - - grid = dim3(divUp(cols, block.x * block.y), - divUp(rows, block.y * block.x)); + typedef typename SelectIf< + TypesEquals::value, + double, + typename SelectIf::value, float, int>::type + >::type work_type; - grid.x = ::min(grid.x, block.x); - grid.y = ::min(grid.y, block.y); - } + GpuMat_ src(_src); + GpuMat_ buf(_buf); - void getBufSize(int cols, int rows, int& bufcols, int& bufrows) - { - dim3 block, grid; - getLaunchCfg(cols, rows, block, grid); + if (mask.empty()) + gridFindMinMaxVal(src, buf); + else + gridFindMinMaxVal(src, buf, globPtr(mask)); - bufcols = grid.x * grid.y * sizeof(double); - bufrows = 2; - } + work_type data[2]; + buf.download(cv::Mat(1, 2, buf.type(), data)); - __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf) - { - *minval_buf = numeric_limits::max(); - *maxval_buf = numeric_limits::min(); - } - __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf) - { - *minval_buf = numeric_limits::max(); - *maxval_buf = -numeric_limits::max(); - } - __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf) - { - *minval_buf = numeric_limits::max(); - *maxval_buf = -numeric_limits::max(); - } + if (minVal) + *minVal = data[0]; - template - void setDefault(R* minval_buf, R* maxval_buf) - { - setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf); + if (maxVal) + *maxVal = data[1]; } +} - template - void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf) +void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf) +{ + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal); + static const func_t funcs[] = { - typedef typename MinMaxTypeTraits::best_type R; - - dim3 block, grid; - getLaunchCfg(src.cols, src.rows, block, grid); - - const int twidth = divUp(divUp(src.cols, grid.x), block.x); - const int theight = divUp(divUp(src.rows, grid.y), block.y); - - R* minval_buf = (R*) buf.ptr(0); - R* maxval_buf = (R*) buf.ptr(1); + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl + }; - setDefault(minval_buf, maxval_buf); + GpuMat src = _src.getGpuMat(); + GpuMat mask = _mask.getGpuMat(); - if (mask.data) - kernel<<>>((PtrStepSz) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight); - else - kernel<<>>((PtrStepSz) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight); + CV_Assert( src.channels() == 1 ); + CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); - cudaSafeCall( cudaGetLastError() ); + const int depth = src.depth(); - cudaSafeCall( cudaDeviceSynchronize() ); + const int work_type = depth == CV_64F ? CV_64F : depth == CV_32F ? CV_32F : CV_32S; + ensureSizeIsEnough(1, 2, work_type, buf); - R minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - } + const func_t func = funcs[src.depth()]; - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); + func(src, mask, buf, minVal, maxVal); } -#endif // CUDA_DISABLER +#endif diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index d4e53b77f5..e3a8d6d7a1 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -186,53 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT return retVal; } -//////////////////////////////////////////////////////////////////////// -// minMax - -namespace minMax -{ - void getBufSize(int cols, int rows, int& bufcols, int& bufrows); - - template - void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); -} - -void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf) -{ - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); - - typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); - static const func_t funcs[] = - { - ::minMax::run, - ::minMax::run, - ::minMax::run, - ::minMax::run, - ::minMax::run, - ::minMax::run, - ::minMax::run - }; - - CV_Assert( src.channels() == 1 ); - CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); - - if (src.depth() == CV_64F) - { - if (!deviceSupports(NATIVE_DOUBLE)) - CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); - } - - Size buf_size; - ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height); - ensureSizeIsEnough(buf_size, CV_8U, buf); - - const func_t func = funcs[src.depth()]; - - double temp1, temp2; - func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf); -} - //////////////////////////////////////////////////////////////////////// // minMaxLoc diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp index c220a94214..21a95eace2 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp @@ -440,30 +440,24 @@ namespace grid_reduce_detail __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { typedef typename PtrTraits::value_type src_type; - const int cn = VecTraits::cn; - typedef typename MakeVec::type work_type; - glob_reduce, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream); + glob_reduce, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream); } template __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { typedef typename PtrTraits::value_type src_type; - const int cn = VecTraits::cn; - typedef typename MakeVec::type work_type; - glob_reduce, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream); + glob_reduce, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream); } template __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { typedef typename PtrTraits::value_type src_type; - const int cn = VecTraits::cn; - typedef typename MakeVec::type work_type; - glob_reduce, Policy>(src, result, mask, rows, cols, stream); + glob_reduce, Policy>(src, result, mask, rows, cols, stream); } }