@@ -46,13 +46,13 @@
#include "internal_shared.hpp" |
|
|
|
|
#include "saturate_cast.hpp" |
|
|
|
|
|
|
|
|
|
 #ifndef __CUDA_ARCH__
     #define __CUDA_ARCH__ 0
 #endif
 
 #define OPENCV_GPU_LOG_WARP_SIZE (5)
 #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
 #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
 #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
 
 #if defined(_WIN64) || defined(__LP64__)
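
// --- Illustration, not part of the patch -------------------------------------
// How the constants above are typically used: OPENCV_GPU_WARP_SIZE expands to
// 1 << 5 = 32, and OPENCV_GPU_MEM_BANKS to 32 on sm_20+ (Fermi) or 16 on sm_1x
// (Tesla). A common pattern is padding a shared-memory tile by one column so
// that column-wise accesses by a warp land in distinct banks. The kernel name
// transposeTile is hypothetical and assumes a 32x32 thread block (sm_20+ only).
__global__ void transposeTile(const float* src, float* dst, int n)
{
    // the +1 column gives rows a stride of 33 floats, so the column read
    // below touches 32 different banks instead of a single one
    __shared__ float tile[OPENCV_GPU_WARP_SIZE][OPENCV_GPU_WARP_SIZE + 1];

    int x = blockIdx.x * OPENCV_GPU_WARP_SIZE + threadIdx.x;
    int y = blockIdx.y * OPENCV_GPU_WARP_SIZE + threadIdx.y;
    if (x < n && y < n)
        tile[threadIdx.y][threadIdx.x] = src[y * n + x];

    __syncthreads();

    x = blockIdx.y * OPENCV_GPU_WARP_SIZE + threadIdx.x;
    y = blockIdx.x * OPENCV_GPU_WARP_SIZE + threadIdx.y;
    if (x < n && y < n)
        dst[y * n + x] = tile[threadIdx.x][threadIdx.y];
}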
@@ -65,15 +65,15 @@
 namespace cv { namespace gpu { namespace device
 {
 
     template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
     {
         T temp = a;
         a = b;
         b = temp;
     }
 
     // warp-synchronous 32 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op)
     {
         data[tid] = partial_reduction;
 
@@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device
     }
 
     // warp-synchronous 16 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op)
     {
         data[tid] = partial_reduction;
 
@@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device
     }
 
     // warp-synchronous reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op)
     {
         if (tid < n)
             data[tid] = partial_reduction;
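
// --- Illustration, not part of the patch -------------------------------------
// A usage sketch for the signatures changed above, assuming this header's
// definitions are in scope. After the change the accumulator is a plain T&
// (a register variable); only the shared staging buffer stays volatile, which
// is what keeps the warp-synchronous steps ordered. PlusOp and rowSum are
// hypothetical names, and the sketch assumes the usual reduction ladder that
// leaves the result in data[0].
struct PlusOp
{
    __device__ __forceinline__ float operator()(float a, float b) const { return a + b; }
};

// launch as rowSum<<<rows, OPENCV_GPU_WARP_SIZE>>>(src, dst, cols):
// one 32-thread warp per block sums one row of a rows x cols matrix
__global__ void rowSum(const float* src, float* dst, int cols)
{
    __shared__ volatile float smem[OPENCV_GPU_WARP_SIZE];

    const int tid = threadIdx.x;

    float partial = 0.0f; // plain register accumulator, no volatile required
    for (int i = tid; i < cols; i += OPENCV_GPU_WARP_SIZE)
        partial += src[blockIdx.x * cols + i];

    warpReduce32(smem, partial, tid, PlusOp());

    if (tid == 0)
        dst[blockIdx.x] = smem[0];
}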