Merge pull request #3843 from vrabaud:cuda

Get cudalegacy to compile with clang CUDA and without CUDA
pull/3849/head
Alexander Smorkalov 5 months ago committed by GitHub
commit 8e6285a79b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      modules/cudalegacy/include/opencv2/cudalegacy.hpp
  2. 3
      modules/cudalegacy/src/NCV.cpp
  3. 2
      modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu
  4. 4
      modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
  5. 4
      modules/cudalegacy/src/cuda/NCVPixelOperations.hpp
  6. 32
      modules/cudalegacy/src/cuda/NPP_staging.cu
  7. 6
      modules/cudalegacy/src/cuda/needle_map.cu
  8. 2
      modules/cudalegacy/src/precomp.hpp

@ -44,11 +44,13 @@
#define OPENCV_CUDALEGACY_HPP #define OPENCV_CUDALEGACY_HPP
#include "opencv2/core/cuda.hpp" #include "opencv2/core/cuda.hpp"
#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
#include "opencv2/cudalegacy/NCV.hpp" #include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVPyramid.hpp" #include "opencv2/cudalegacy/NCVPyramid.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp" #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
#endif
#include "opencv2/video/background_segm.hpp" #include "opencv2/video/background_segm.hpp"
/** /**

@ -42,6 +42,8 @@
#include "precomp.hpp" #include "precomp.hpp"
#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
//============================================================================== //==============================================================================
// //
// Error handling helpers // Error handling helpers
@ -886,3 +888,4 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
{ {
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color); return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
} }
#endif

@ -695,8 +695,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
//prepare image pyramid //prepare image pyramid
ImagePyramid pyr(desc.number_of_outer_iterations); ImagePyramid pyr(desc.number_of_outer_iterations);
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
float scale = 1.0f; float scale = 1.0f;
//cuda arrays for frames //cuda arrays for frames

@ -193,7 +193,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>
if (tbDoAtomicCompaction) bInactiveThread = true; else return; if (tbDoAtomicCompaction) bInactiveThread = true; else return;
} }
if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
{ {
outMaskVal = d_inMask[maskOffset]; outMaskVal = d_inMask[maskOffset];
y_offs = outMaskVal >> 16; y_offs = outMaskVal >> 16;
@ -210,7 +210,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>
if (tbDoAtomicCompaction) bInactiveThread = true; else return; if (tbDoAtomicCompaction) bInactiveThread = true; else return;
} }
if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
{ {
maskOffset = y_offs * mask2Dstride + x_offs; maskOffset = y_offs * mask2Dstride + x_offs;

@ -84,7 +84,7 @@ template<> struct TConvVec2Base<double1> {typedef Ncv64f TBase;};
template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;}; template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;};
template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;}; template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
#define NC(T) (sizeof(T) / sizeof(TConvVec2Base<T>::TBase)) #define NC(T) (sizeof(T) / sizeof(typename TConvVec2Base<T>::TBase))
template<typename TBase, Ncv32u NC> struct TConvBase2Vec; template<typename TBase, Ncv32u NC> struct TConvBase2Vec;
template<> struct TConvBase2Vec<Ncv8u, 1> {typedef uchar1 TVec;}; template<> struct TConvBase2Vec<Ncv8u, 1> {typedef uchar1 TVec;};
@ -115,7 +115,7 @@ template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, N
template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);} template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
template<typename Tout> inline Tout _pixMakeZero(); template<typename Tout> inline __host__ __device__ Tout _pixMakeZero();
template<> inline __host__ __device__ uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);} template<> inline __host__ __device__ uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
template<> inline __host__ __device__ uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);} template<> inline __host__ __device__ uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
template<> inline __host__ __device__ uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);} template<> inline __host__ __device__ uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}

@ -85,26 +85,24 @@ const Ncv32u NUM_SCAN_THREADS = 256;
const Ncv32u LOG2_NUM_SCAN_THREADS = 8; const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
template<class T_in, class T_out> template<class T_in, class T_out, bool tbDoSqr>
struct _scanElemOp struct _scanElemOp
{ {
template<bool tbDoSqr> static __host__ __device__ T_out scanElemOp(T_in elem);
static inline __host__ __device__ T_out scanElemOp(T_in elem) };
{
return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
}
private:
template <int v> struct Int2Type { enum { value = v }; };
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>) template<class T_in, class T_out>
{ struct _scanElemOp<T_in, T_out, false>
return (T_out)elem; {
static inline __host__ __device__ T_out scanElemOp(T_in elem) {
return (T_out)(elem);
} }
};
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>) template<class T_in, class T_out>
{ struct _scanElemOp<T_in, T_out, true>
{
static inline __host__ __device__ T_out scanElemOp(T_in elem) {
return (T_out)(elem*elem); return (T_out)(elem*elem);
} }
}; };
@ -177,7 +175,7 @@ __global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u
Ncv32u curElemOffs = offsetX + threadIdx.x; Ncv32u curElemOffs = offsetX + threadIdx.x;
T_out curScanElem; T_out curScanElem;
T_in curElem; T_in curElem = 0;
T_out curElemMod; T_out curElemMod;
if (curElemOffs < srcWidth) if (curElemOffs < srcWidth)
@ -185,7 +183,7 @@ __global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u
//load elements //load elements
curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs); curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
} }
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem); curElemMod = _scanElemOp<T_in, T_out, tbDoSqr>::scanElemOp(curElem);
//inclusive scan //inclusive scan
curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x); curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x);

@ -76,19 +76,19 @@ namespace cv { namespace cuda { namespace device
// now add the column sums // now add the column sums
const uint X = threadIdx.x; const uint X = threadIdx.x;
if (X | 0xfe == 0xfe) // bit 0 is 0 if (X | (0xfe == 0xfe)) // bit 0 is 0
{ {
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1]; u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
} }
if (X | 0xfe == 0xfc) // bits 0 & 1 == 0 if (X | (0xfe == 0xfc)) // bits 0 & 1 == 0
{ {
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2]; u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
} }
if (X | 0xf8 == 0xf8) if (X | (0xf8 == 0xf8))
{ {
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4]; u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];

@ -80,6 +80,8 @@
#endif #endif
#include "opencv2/core/private.cuda.hpp" #include "opencv2/core/private.cuda.hpp"
#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
#include "opencv2/cudalegacy/private.hpp" #include "opencv2/cudalegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */ #endif /* __OPENCV_PRECOMP_H__ */

Loading…
Cancel
Save