diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 7377f52578..1b09f4c777 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -860,6 +860,9 @@ namespace cv
     //! counts non-zero array elements
     CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
 
+    //! reduces a matrix to a vector
+    CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+
     ///////////////////////////// Calibration 3D //////////////////////////////////
 
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 60e6c886e8..722be26187 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -1804,4 +1804,278 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
     template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
     template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // reduce
+
+    template <typename S> struct SumReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct AvgReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline__ double result(S r, double sz) const
+        {
+            return r / sz;
+        }
+    };
+
+    template <typename S> struct MinReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::max();
+        }
+
+        template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
+        {
+            return saturate_cast<T>(::min(a, b));
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmin(a, b);
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct MaxReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::min();
+        }
+
+        template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
+        {
+            return ::max(a, b);
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmax(a, b);
+        }
+
+        __device__ __forceinline__ S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
+    {
+        __shared__ S smem[16 * 16];
+
+        const int x = blockIdx.x * 16 + threadIdx.x;
+
+        if (x < src.cols)
+        {
+            S myVal = op.startValue();
+
+            for (int y = threadIdx.y; y < src.rows; y += 16)
+                myVal = op(myVal, src.ptr(y)[x]);
+
+            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
+            __syncthreads();
+
+            if (threadIdx.y == 0)
+            {
+                myVal = smem[threadIdx.x];
+
+                #pragma unroll
+                for (int i = 1; i < 16; ++i)
+                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
+
+                dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
+            }
+        }
+    }
+
+    template
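
For reference, a minimal usage sketch of the new entry point (not part of the patch). It assumes gpu::reduce mirrors the semantics of the CPU cv::reduce: dim = 0 collapses the matrix to a single row, dim = 1 to a single column, with the CV_REDUCE_SUM / CV_REDUCE_AVG / CV_REDUCE_MAX / CV_REDUCE_MIN constants from the core module selecting the operation. Sizes, types, and variable names below are illustrative only.

    #include <opencv2/opencv.hpp>   // core API and CV_REDUCE_* constants
    #include <opencv2/gpu/gpu.hpp>  // cv::gpu::GpuMat, cv::gpu::reduce

    int main()
    {
        cv::Mat host = cv::Mat::ones(480, 640, CV_8UC1);  // hypothetical input
        cv::gpu::GpuMat src(host);                        // upload to the device
        cv::gpu::GpuMat rowSums, colAvgs;

        // Collapse all rows into one row of per-column sums; request a wider
        // dtype (CV_32S) so 8-bit sums do not saturate.
        cv::gpu::reduce(src, rowSums, 0, CV_REDUCE_SUM, CV_32S);

        // Collapse all columns into one column of per-row averages.
        cv::gpu::reduce(src, colAvgs, 1, CV_REDUCE_AVG, CV_32F);

        return 0;
    }

The dim = 0 path is the one served by the reduceRows kernel above: each 16x16 thread block strides down the rows of a 16-column stripe, accumulates per-thread partial results in shared memory, and row 0 of the block folds the 16 partials into the final per-column value.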