|
|
|
@ -40,27 +40,27 @@ |
|
|
|
|
//
|
|
|
|
|
//M*/
|
|
|
|
|
|
|
|
|
|
#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__ |
|
|
|
|
#define __OPENCV_GPU_UTILITY_DETAIL_HPP__ |
|
|
|
|
#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ |
|
|
|
|
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ |
|
|
|
|
|
|
|
|
|
namespace cv { namespace gpu { namespace device
|
|
|
|
|
namespace cv { namespace gpu { namespace device |
|
|
|
|
{ |
|
|
|
|
namespace utility_detail |
|
|
|
|
{ |
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// Reduction
|
|
|
|
|
// Reductor
|
|
|
|
|
|
|
|
|
|
template <int n> struct WarpReductor |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
|
|
|
|
{ |
|
|
|
|
if (tid < n) |
|
|
|
|
data[tid] = partial_reduction;
|
|
|
|
|
data[tid] = partial_reduction; |
|
|
|
|
if (n > 32) __syncthreads(); |
|
|
|
|
|
|
|
|
|
if (n > 32) |
|
|
|
|
{ |
|
|
|
|
if (tid < n - 32)
|
|
|
|
|
if (tid < n - 32) |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); |
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
} |
|
|
|
|
else if (n > 16) |
|
|
|
|
{ |
|
|
|
|
if (tid < n - 16)
|
|
|
|
|
if (tid < n - 16) |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
} |
|
|
|
|
else if (n > 8) |
|
|
|
|
{ |
|
|
|
|
if (tid < n - 8)
|
|
|
|
|
if (tid < n - 8) |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); |
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
@ -96,23 +96,23 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
} |
|
|
|
|
else if (n > 4) |
|
|
|
|
{ |
|
|
|
|
if (tid < n - 4)
|
|
|
|
|
if (tid < n - 4) |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
|
|
|
|
} |
|
|
|
|
}
|
|
|
|
|
} |
|
|
|
|
else if (n > 2) |
|
|
|
|
{ |
|
|
|
|
if (tid < n - 2)
|
|
|
|
|
if (tid < n - 2) |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
|
|
|
|
} |
|
|
|
|
}
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
template <> struct WarpReductor<64> |
|
|
|
@ -121,15 +121,15 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction; |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
if (tid < 32)
|
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
@ -138,14 +138,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction; |
|
|
|
|
|
|
|
|
|
if (tid < 16)
|
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
@ -154,13 +154,13 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction; |
|
|
|
|
|
|
|
|
|
if (tid < 8)
|
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
@ -169,12 +169,12 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction; |
|
|
|
|
|
|
|
|
|
if (tid < 4)
|
|
|
|
|
|
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
|
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
@ -214,11 +214,11 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// PredValWarpReductor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <int n> struct PredValWarpReductor; |
|
|
|
|
template <> struct PredValWarpReductor<64> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V, typename Pred>
|
|
|
|
|
template <typename T, typename V, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 32) |
|
|
|
@ -253,14 +253,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -272,7 +272,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
}; |
|
|
|
|
template <> struct PredValWarpReductor<32> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V, typename Pred>
|
|
|
|
|
template <typename T, typename V, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 16) |
|
|
|
@ -300,14 +300,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -320,7 +320,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
|
|
|
|
|
template <> struct PredValWarpReductor<16> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V, typename Pred>
|
|
|
|
|
template <typename T, typename V, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 8) |
|
|
|
@ -341,14 +341,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -360,7 +360,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
}; |
|
|
|
|
template <> struct PredValWarpReductor<8> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V, typename Pred>
|
|
|
|
|
template <typename T, typename V, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 4) |
|
|
|
@ -374,14 +374,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -407,7 +407,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
myData = sdata[tid]; |
|
|
|
|
myVal = sval[tid]; |
|
|
|
|
|
|
|
|
|
if (n >= 512 && tid < 256)
|
|
|
|
|
if (n >= 512 && tid < 256) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 256]; |
|
|
|
|
|
|
|
|
@ -416,9 +416,9 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 256]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
if (n >= 256 && tid < 128)
|
|
|
|
|
if (n >= 256 && tid < 128) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 128]; |
|
|
|
|
|
|
|
|
@ -427,9 +427,9 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 128]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
if (n >= 128 && tid < 64)
|
|
|
|
|
if (n >= 128 && tid < 64) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 64]; |
|
|
|
|
|
|
|
|
@ -438,13 +438,13 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 64]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
}
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
if (n >= 64)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 64) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 32]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -453,8 +453,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval[tid] = myVal = sval[tid + 32]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 32)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 32) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 16]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -463,8 +463,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval[tid] = myVal = sval[tid + 16]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 16)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 16) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 8]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -473,8 +473,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval[tid] = myVal = sval[tid + 8]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 8)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 8) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 4]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -483,18 +483,18 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval[tid] = myVal = sval[tid + 4]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 4)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 4) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 2]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval[tid] = myVal = sval[tid + 2]; |
|
|
|
|
}
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 2)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 2) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 1]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -513,7 +513,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
template <int n> struct PredVal2WarpReductor; |
|
|
|
|
template <> struct PredVal2WarpReductor<64> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 32) |
|
|
|
@ -553,7 +553,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 4]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -561,7 +561,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 2]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -574,7 +574,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
}; |
|
|
|
|
template <> struct PredVal2WarpReductor<32> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 16) |
|
|
|
@ -606,7 +606,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 4]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -614,7 +614,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 2]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -628,7 +628,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
|
|
|
|
|
template <> struct PredVal2WarpReductor<16> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 8) |
|
|
|
@ -652,7 +652,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 4]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -660,7 +660,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 2]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -673,7 +673,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
}; |
|
|
|
|
template <> struct PredVal2WarpReductor<8> |
|
|
|
|
{ |
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
if (tid < 4) |
|
|
|
@ -689,7 +689,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 4]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 2]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -697,7 +697,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 2]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 2]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = sdata[tid + 1]; |
|
|
|
|
if (pred(reg, myData)) |
|
|
|
|
{ |
|
|
|
@ -712,7 +712,7 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
template <bool warp> struct PredVal2ReductionDispatcher; |
|
|
|
|
template <> struct PredVal2ReductionDispatcher<true> |
|
|
|
|
{ |
|
|
|
|
template <int n, typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <int n, typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); |
|
|
|
@ -720,14 +720,14 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
}; |
|
|
|
|
template <> struct PredVal2ReductionDispatcher<false> |
|
|
|
|
{ |
|
|
|
|
template <int n, typename T, typename V1, typename V2, typename Pred>
|
|
|
|
|
template <int n, typename T, typename V1, typename V2, typename Pred> |
|
|
|
|
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
|
|
|
|
{ |
|
|
|
|
myData = sdata[tid]; |
|
|
|
|
myVal1 = sval1[tid]; |
|
|
|
|
myVal2 = sval2[tid]; |
|
|
|
|
|
|
|
|
|
if (n >= 512 && tid < 256)
|
|
|
|
|
if (n >= 512 && tid < 256) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 256]; |
|
|
|
|
|
|
|
|
@ -737,9 +737,9 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 256]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 256]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
if (n >= 256 && tid < 128)
|
|
|
|
|
if (n >= 256 && tid < 128) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 128]; |
|
|
|
|
|
|
|
|
@ -749,9 +749,9 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 128]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 128]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
if (n >= 128 && tid < 64)
|
|
|
|
|
if (n >= 128 && tid < 64) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 64]; |
|
|
|
|
|
|
|
|
@ -761,13 +761,13 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 64]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 64]; |
|
|
|
|
} |
|
|
|
|
__syncthreads();
|
|
|
|
|
}
|
|
|
|
|
__syncthreads(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
if (n >= 64)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 64) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 32]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -777,8 +777,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 32]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 32)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 32) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 16]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -788,8 +788,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 16]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 16)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 16) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 8]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -799,8 +799,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 8]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 8)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 8) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 4]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -810,8 +810,8 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 4)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 4) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 2]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -819,10 +819,10 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
sdata[tid] = myData = reg; |
|
|
|
|
sval1[tid] = myVal1 = sval1[tid + 2]; |
|
|
|
|
sval2[tid] = myVal2 = sval2[tid + 2]; |
|
|
|
|
}
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (n >= 2)
|
|
|
|
|
{
|
|
|
|
|
if (n >= 2) |
|
|
|
|
{ |
|
|
|
|
T reg = sdata[tid + 1]; |
|
|
|
|
|
|
|
|
|
if (pred(reg, myData)) |
|
|
|
@ -838,4 +838,4 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
} // namespace utility_detail
|
|
|
|
|
}}} // namespace cv { namespace gpu { namespace device
|
|
|
|
|
|
|
|
|
|
#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__
|
|
|
|
|
#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|