[*] Approach to the bug with integral image calculation on SM_2.0 (Fermi)

pull/13383/head
Anton Obukhov 14 years ago
parent 276a19d354
commit c6a7432e92
  1. 17
      modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
  2. 17
      modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu

@ -71,6 +71,9 @@
//==============================================================================
NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE and size is power of 2
template <class T>
@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
pos += K_WARP_SIZE;
s_Data[pos] = idata;
for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
{
s_Data[pos] += s_Data[pos - offset];
}
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
//{
// s_Data[pos] += s_Data[pos - offset];
//}
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
}

@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
//==============================================================================
NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE and size is power of 2
template <class T>
@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
pos += K_WARP_SIZE;
s_Data[pos] = idata;
for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
{
s_Data[pos] += s_Data[pos - offset];
}
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
//{
// s_Data[pos] += s_Data[pos - offset];
//}
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
}

Loading…
Cancel
Save