added additional error check

pull/13383/head
Vladislav Vinogradov 13 years ago
parent 79cc05d062
commit ddf56fa629
  1. 57
      modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
  2. 38
      modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
  3. 2
      modules/gpu/src/nvidia/core/NCV.cu
  4. 13
      modules/gpu/src/nvidia/core/NCV.hpp

@ -876,12 +876,12 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
NcvRect32u dstROI (0, 0, level_width, level_height);
// frame 0
nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
// frame 1
nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
}
I0 = level_frame0.release();
@ -962,32 +962,32 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
// Ix0
nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Iy0
nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Ix
nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Iy
nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Ixx
nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Iyy
nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
// Ixy
nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@ -1030,6 +1030,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
alpha,
gamma);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@ -1040,6 +1042,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@ -1073,6 +1077,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@ -1089,6 +1095,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
@ -1096,7 +1104,9 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
//update u and v
add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
//prolongate using texture
pyr.w.pop_back();
@ -1116,15 +1126,17 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
NcvRect32u dstROI (0, 0, nw, nh);
nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
@ -1143,7 +1155,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
(vOut.ptr(), vOut.pitch(), ptrV->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
}

@ -315,7 +315,8 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -768,7 +769,7 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
}
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -997,7 +998,7 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
(NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
}
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -1157,7 +1158,7 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
<T>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStride, d_dst, dstStride, srcRoi);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -1407,7 +1408,8 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
d_hierSums.ptr(),
d_hierSums.ptr() + partSumOffsets[1],
elemRemove);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
//calculate hierarchical partial sums
for (Ncv32u i=1; i<partSumNums.size()-1; i++)
@ -1438,7 +1440,8 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
NULL,
NULL);
}
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
}
//adjust hierarchical partial sums
@ -1454,7 +1457,8 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_hierSums.ptr() + partSumOffsets[i], partSumNums[i],
d_hierSums.ptr() + partSumOffsets[i+1]);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
}
}
else
@ -1466,7 +1470,8 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
(d_src, srcLen,
d_hierSums.ptr(),
NULL, elemRemove);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
}
//compact source vector using indices
@ -1480,7 +1485,8 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcLen, d_hierSums.ptr(), d_dst,
elemRemove, d_numDstElements.ptr());
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
//get number of dst elements
if (dstLenPinned != NULL)
@ -1773,6 +1779,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
return NPPST_ERROR;
@ -1842,6 +1849,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
return NPPST_ERROR;
@ -1946,7 +1954,7 @@ NCVStatus BlendFrames(const Ncv32f *src0,
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDAReturn (cudaGetLastError (), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -2262,6 +2270,8 @@ NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
ForwardWarpKernel_PSF1x1 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -2294,12 +2304,18 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
MemsetKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(0, srcSize.width, srcSize.height, pBuffer);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
ForwardWarpKernel_PSF2x2 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
NormalizeKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pBuffer, srcSize.width, srcSize.height, srcStep, pDst);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
@ -2557,5 +2573,7 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
status = NPPST_ERROR;
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return status;
}

@ -874,7 +874,7 @@ static NCVStatus drawRectsWrapperDevice(T *d_dst,
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
return NCV_SUCCESS;
}

@ -285,9 +285,16 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
#define ncvAssertCUDAReturn(cudacall, errCode) \
do \
{ \
cudaError_t resCall = cudacall; \
cudaError_t resGLE = cudaGetLastError(); \
ncvAssertPrintReturn(cudaSuccess==resCall && cudaSuccess==resGLE, "cudaError_t=" << (int)(resCall | resGLE), errCode); \
cudaError_t res = cudacall; \
ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
} while (0)
#define ncvAssertCUDALastErrorReturn(errCode) \
do \
{ \
cudaError_t res = cudaGetLastError(); \
ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
} while (0)

Loading…
Cancel
Save