From 237b765a5c5abdcce55ba478edb7ca79936da595 Mon Sep 17 00:00:00 2001 From: Mikhail Scherbina <42784580+awarebayes@users.noreply.github.com> Date: Wed, 24 Aug 2022 20:43:17 +0300 Subject: [PATCH 1/2] disable npp in multistream context --- modules/cudawarping/src/warp.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp index a18a459c2..a6041ecc2 100644 --- a/modules/cudawarping/src/warp.cpp +++ b/modules/cudawarping/src/warp.cpp @@ -242,8 +242,7 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation]; // NPP bug on float data - useNpp = useNpp && src.depth() != CV_32F; - + useNpp = useNpp && src.depth() != CV_32F && StreamAccessor::getStream(stream) == nullptr; if (useNpp) { typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream); From 8be686d88d3cc34743031a46199c6db2ed611092 Mon Sep 17 00:00:00 2001 From: Mikhail Scherbina <42784580+awarebayes@users.noreply.github.com> Date: Sat, 27 Aug 2022 17:17:52 +0300 Subject: [PATCH 2/2] Use static dispatch for context --- modules/cudawarping/src/warp.cpp | 113 +++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 22 deletions(-) diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp index a6041ecc2..66b3d0be8 100644 --- a/modules/cudawarping/src/warp.cpp +++ b/modules/cudawarping/src/warp.cpp @@ -45,8 +45,19 @@ using namespace cv; using namespace cv::cuda; +#define canUseContext NPP_VERSION >= (10 * 1000 + 1 * 100 + 0) +#if canUseContext + #define CTX_PREFIX _Ctx +#else + #define CTX_PREFIX +#endif +#define PPCAT_NX(A, B) A ## B +#define PPCAT(A, B) PPCAT_NX(A, B) +#define TRY_CTX(func) PPCAT(func, CTX_PREFIX) + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + void cv::cuda::warpAffine(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); } void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); } @@ -135,7 +146,17 @@ void cv::cuda::buildWarpPerspectiveMaps(InputArray _M, bool inverse, Size dsize, namespace { - template struct NppWarpFunc + template struct NppWarpFunc + { + typedef typename NPPTypeTraits::npp_type npp_type; + + typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst, + int dstStep, NppiRect dstRoi, const double coeffs[][3], + int interpolation, NppStreamContext stream_ctx); + }; + + template + struct NppWarpFunc { typedef typename NPPTypeTraits::npp_type npp_type; @@ -144,11 +165,24 @@ namespace int interpolation); }; - template ::func_t func> struct NppWarp + template ::func_t func> struct NppWarp { - typedef typename NppWarpFunc::npp_type npp_type; + + typedef typename NppWarpFunc::npp_type npp_type; static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream) + { + call_impl(src, dst, coeffs, interpolation, stream, Int2Type()); + } + + template + struct Int2Type + { + enum { value = I }; + }; + + // with context + static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type) { static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; @@ -168,7 +202,40 @@ namespace dstroi.height = dst.rows; dstroi.width = dst.cols; - cv::cuda::NppStreamHandler h(stream); + NppStatus nppStatus = NPP_SUCCESS; + NppStreamContext nppStreamContext{}; + nppStatus = nppGetStreamContext(&nppStreamContext); + CV_Assert(NPP_SUCCESS == nppStatus); + nppStreamContext.hStream = stream; + + nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, + dst.ptr(), static_cast(dst.step), dstroi, + coeffs, npp_inter[interpolation], nppStreamContext) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + // without context + static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type) + { + static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; + + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + NppiRect srcroi; + srcroi.x = 0; + srcroi.y = 0; + srcroi.height = src.rows; + srcroi.width = src.cols; + + NppiRect dstroi; + dstroi.x = 0; + dstroi.y = 0; + dstroi.height = dst.rows; + dstroi.width = dst.cols; nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, dst.ptr(), static_cast(dst.step), dstroi, @@ -193,7 +260,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP ); - _dst.create(dsize, src.type()); + if (_dst.size() != dsize) + _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); Size wholeSize; @@ -242,7 +310,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation]; // NPP bug on float data - useNpp = useNpp && src.depth() != CV_32F && StreamAccessor::getStream(stream) == nullptr; + useNpp = useNpp && src.depth() != CV_32F; + if (useNpp) { typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream); @@ -250,20 +319,20 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size static const func_t funcs[2][6][4] = { { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} }, { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} } }; @@ -389,20 +458,20 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, static const func_t funcs[2][6][4] = { { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} }, { - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call}, - {NppWarp::call, 0, NppWarp::call, NppWarp::call} + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} } };