diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp index 8690f5408..49772e268 100644 --- a/modules/cudawarping/src/warp.cpp +++ b/modules/cudawarping/src/warp.cpp @@ -45,8 +45,19 @@ using namespace cv; using namespace cv::cuda; +#define canUseContext NPP_VERSION >= (10 * 1000 + 1 * 100 + 0) +#if canUseContext + #define CTX_PREFIX _Ctx +#else + #define CTX_PREFIX +#endif +#define PPCAT_NX(A, B) A ## B +#define PPCAT(A, B) PPCAT_NX(A, B) +#define TRY_CTX(func) PPCAT(func, CTX_PREFIX) + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + void cv::cuda::warpAffine(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); } void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); } @@ -135,7 +146,17 @@ void cv::cuda::buildWarpPerspectiveMaps(InputArray _M, bool inverse, Size dsize, namespace { - template struct NppWarpFunc + template struct NppWarpFunc + { + typedef typename NPPTypeTraits::npp_type npp_type; + + typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst, + int dstStep, NppiRect dstRoi, const double coeffs[][3], + int interpolation, NppStreamContext stream_ctx); + }; + + template + struct NppWarpFunc { typedef typename NPPTypeTraits::npp_type npp_type; @@ -150,11 +171,24 @@ namespace #endif }; - template ::func_t func> struct NppWarp + template ::func_t func> struct NppWarp { - typedef typename NppWarpFunc::npp_type npp_type; + + typedef typename NppWarpFunc::npp_type npp_type; static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream) + { + call_impl(src, dst, coeffs, interpolation, stream, Int2Type()); + } + + template + struct Int2Type + { + enum { value = I }; + }; + + // with context + static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type) { static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; @@ -174,7 +208,40 @@ namespace dstroi.height = dst.rows; dstroi.width = dst.cols; - cv::cuda::NppStreamHandler h(stream); + NppStatus nppStatus = NPP_SUCCESS; + NppStreamContext nppStreamContext{}; + nppStatus = nppGetStreamContext(&nppStreamContext); + CV_Assert(NPP_SUCCESS == nppStatus); + nppStreamContext.hStream = stream; + + nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, + dst.ptr(), static_cast(dst.step), dstroi, + coeffs, npp_inter[interpolation], nppStreamContext) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + // without context + static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type) + { + static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; + + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + NppiRect srcroi; + srcroi.x = 0; + srcroi.y = 0; + srcroi.height = src.rows; + srcroi.width = src.cols; + + NppiRect dstroi; + dstroi.x = 0; + dstroi.y = 0; + dstroi.height = dst.rows; + dstroi.width = dst.cols; #if USE_NPP_STREAM_CTX nppSafeCall(func(src.ptr(), srcsz, static_cast(src.step), srcroi, @@ -205,7 +272,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP ); - _dst.create(dsize, src.type()); + if (_dst.size() != dsize) + _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); Size wholeSize; @@ -281,20 +349,20 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size } #else { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} }, { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} } #endif }; @@ -439,20 +507,20 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, } #else { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} }, { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} } #endif };