/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"

#include "unroll_detail.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace reduce
{
    // Reduction operators. Each one supplies the identity element
    // (startValue), the binary fold (operator()), and a finalizer (result)
    // that receives the number of reduced elements so that Avg can divide.
    struct Sum
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Sum() {}
        __host__ __device__ __forceinline__ Sum(const Sum&) {}
    };

    template <typename T> struct OutputType
    {
        typedef float type;
    };
    template <> struct OutputType<double>
    {
        typedef double type;
    };

    struct Avg
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
        {
            return r / sz;
        }

        __host__ __device__ __forceinline__ Avg() {}
        __host__ __device__ __forceinline__ Avg(const Avg&) {}
    };

    struct Min
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            minimum<T> minOp;
            return minOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Min() {}
        __host__ __device__ __forceinline__ Min(const Min&) {}
    };

    struct Max
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            maximum<T> maxOp;
            return maxOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }

        __host__ __device__ __forceinline__ Max() {}
        __host__ __device__ __forceinline__ Max(const Max&) {}
    };

    ///////////////////////////////////////////////////////////

    // Collapses all rows into a single row (one output element per column).
    // Each 16x16 block covers 16 columns; every thread accumulates a strided
    // slice of its column, the partials are staged transposed in shared
    // memory, and each row of threads then finishes one column's reduction.
    template <typename T, typename S, typename D, class Op>
    __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
    {
        __shared__ S smem[16 * 16];

        const int x = blockIdx.x * 16 + threadIdx.x;

        S myVal = op.template startValue<S>();

        if (x < src.cols)
        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
            {
                S srcVal = src(y, x);
                myVal = op(myVal, srcVal);
            }
        }

        // Transposed store: the 16 partials of local column threadIdx.x land
        // in one contiguous 16-element row of shared memory.
        smem[threadIdx.x * 16 + threadIdx.y] = myVal;

        __syncthreads();

        volatile S* srow = smem + threadIdx.y * 16;

        myVal = srow[threadIdx.x];

        device::reduce<16>(srow, myVal, threadIdx.x, op);

        if (threadIdx.x == 0)
            srow[0] = myVal;

        __syncthreads();

        if (threadIdx.y == 0 && x < src.cols)
            dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
    }

    template <typename T, typename S, typename D, class Op>
    void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(src.cols, block.x));

        Op op;
        rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
        static const func_t funcs[] =
        {
            rowsCaller<T, S, D, Sum>,
            rowsCaller<T, S, D, Avg>,
            rowsCaller<T, S, D, Max>,
            rowsCaller<T, S, D, Min>
        };

        funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
    }

    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
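
    // Usage sketch (not from the original file): the op index selects the
    // reduction in funcs[] above, i.e. 0 = Sum, 1 = Avg, 2 = Max, 3 = Min.
    // A hypothetical caller summing each column of an 8-bit single-channel
    // image into a row of floats might look like:
    //
    //     // dBuf is a hypothetical device buffer of src.cols floats
    //     reduce::rows<unsigned char, float, float>(src, dBuf, 0, stream);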

    ///////////////////////////////////////////////////////////

    // Collapses all columns into a single column (one output element per
    // row). One block handles one row: its BLOCK_SIZE threads stride across
    // the columns, and detail::Unroll<cn> expands the block-wide reduction
    // over the cn channels of the work type.
    template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
    __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<S, cn>::vec_type work_type;
        typedef typename TypeVec<D, cn>::vec_type dst_type;

        __shared__ S smem[BLOCK_SIZE * cn];

        const int y = blockIdx.x;

        const src_type* srcRow = src.ptr(y);

        work_type myVal = op.template startValue<work_type>();

        for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
            myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
    }

    template <typename T, typename S, typename D, int cn, class Op>
    void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
    {
        const int BLOCK_SIZE = 256;

        const dim3 block(BLOCK_SIZE);
        const dim3 grid(src.rows);

        Op op;
        colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename S, typename D>
    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
        static const func_t funcs[5][4] =
        {
            {0,0,0,0},
            {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
            {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
            {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
            {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
        };

        funcs[cn][op](src, dst, stream);
    }

    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
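
    // Usage sketch (not from the original file): funcs[5][4] is indexed by
    // channel count (1..4; row 0 is unused) and then by the same op codes
    // as rows(). A hypothetical caller averaging each row of a 3-channel
    // float image into a column of per-channel means:
    //
    //     // dBuf is a hypothetical device buffer of src.rows float3 values
    //     reduce::cols<float, float, float>(src, dBuf, 3, 1, stream);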
}

#endif /* CUDA_DISABLER */