From 237b765a5c5abdcce55ba478edb7ca79936da595 Mon Sep 17 00:00:00 2001
From: Mikhail Scherbina <42784580+awarebayes@users.noreply.github.com>
Date: Wed, 24 Aug 2022 20:43:17 +0300
Subject: [PATCH 1/2] disable npp in multistream context

---
 modules/cudawarping/src/warp.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
index a18a459c2..a6041ecc2 100644
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -242,8 +242,7 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
 
     bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
     // NPP bug on float data
-    useNpp = useNpp && src.depth() != CV_32F;
-
+    useNpp = useNpp && src.depth() != CV_32F && StreamAccessor::getStream(stream) == nullptr;
     if (useNpp)
     {
         typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

From 8be686d88d3cc34743031a46199c6db2ed611092 Mon Sep 17 00:00:00 2001
From: Mikhail Scherbina <42784580+awarebayes@users.noreply.github.com>
Date: Sat, 27 Aug 2022 17:17:52 +0300
Subject: [PATCH 2/2] Use static dispatch for context

---
 modules/cudawarping/src/warp.cpp | 113 +++++++++++++++++++++++++------
 1 file changed, 91 insertions(+), 22 deletions(-)

diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
index a6041ecc2..66b3d0be8 100644
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -45,8 +45,19 @@
 using namespace cv;
 using namespace cv::cuda;
 
+#define canUseContext NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)
+#if canUseContext
+    #define CTX_PREFIX _Ctx
+#else
+    #define CTX_PREFIX
+#endif
+#define PPCAT_NX(A, B) A ## B
+#define PPCAT(A, B) PPCAT_NX(A, B)
+#define TRY_CTX(func) PPCAT(func, CTX_PREFIX)
+
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
+
 void cv::cuda::warpAffine(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
 void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
@@ -135,7 +146,17 @@ void cv::cuda::buildWarpPerspectiveMaps(InputArray _M, bool inverse, Size dsize,
 
 namespace
 {
-    template <int DEPTH> struct NppWarpFunc
+    template <int DEPTH, bool CanUseContext=true> struct NppWarpFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+
+        typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst,
+                                    int dstStep, NppiRect dstRoi, const double coeffs[][3],
+                                    int interpolation, NppStreamContext stream_ctx);
+    };
+
+    template <int DEPTH>
+    struct NppWarpFunc<DEPTH, false>
     {
         typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
 
@@ -144,11 +165,24 @@ namespace
                                     int interpolation);
     };
 
-    template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
+    template <int DEPTH, typename NppWarpFunc<DEPTH, canUseContext>::func_t func> struct NppWarp
     {
-        typedef typename NppWarpFunc<DEPTH>::npp_type npp_type;
+
+        typedef typename NppWarpFunc<DEPTH, canUseContext>::npp_type npp_type;
 
         static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
+        {
+            call_impl(src, dst, coeffs, interpolation, stream, Int2Type<canUseContext>());
+        }
+
+        template <int I>
+        struct Int2Type
+        {
+          enum { value = I };
+        };
+
+        // with context
+        static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type<true>)
         {
             static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
 
@@ -168,7 +202,40 @@ namespace
             dstroi.height = dst.rows;
             dstroi.width = dst.cols;
 
-            cv::cuda::NppStreamHandler h(stream);
+            NppStatus nppStatus = NPP_SUCCESS;
+            NppStreamContext nppStreamContext{};
+            nppStatus = nppGetStreamContext(&nppStreamContext);
+            CV_Assert(NPP_SUCCESS == nppStatus);
+            nppStreamContext.hStream = stream;
+
+            nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi,
+                              dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi,
+                              coeffs, npp_inter[interpolation], nppStreamContext) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // without context
+        static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type<false>)
+        {
+            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+
+            NppiSize srcsz;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;
+
+            NppiRect srcroi;
+            srcroi.x = 0;
+            srcroi.y = 0;
+            srcroi.height = src.rows;
+            srcroi.width = src.cols;
+
+            NppiRect dstroi;
+            dstroi.x = 0;
+            dstroi.y = 0;
+            dstroi.height = dst.rows;
+            dstroi.width = dst.cols;
 
             nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi,
                               dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi,
@@ -193,7 +260,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
     CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
     CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
 
-    _dst.create(dsize, src.type());
+    if (_dst.size() != dsize)
+        _dst.create(dsize, src.type());
     GpuMat dst = _dst.getGpuMat();
 
     Size wholeSize;
@@ -242,7 +310,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
 
     bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
     // NPP bug on float data
-    useNpp = useNpp && src.depth() != CV_32F && StreamAccessor::getStream(stream) == nullptr;
+    useNpp = useNpp && src.depth() != CV_32F;
+
     if (useNpp)
     {
         typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
@@ -250,20 +319,20 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
         static const func_t funcs[2][6][4] =
         {
             {
-                {NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
+                {NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
+                {NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
+                {NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C4R)>::call},
+                {NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C4R)>::call}
             },
             {
-                {NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
+                {NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
+                {NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
+                {NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C4R)>::call},
+                {NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C4R)>::call}
             }
         };
 
@@ -389,20 +458,20 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M,
         static const func_t funcs[2][6][4] =
         {
             {
-                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
+                {NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
+                {NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
+                {NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C4R)>::call},
+                {NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C4R)>::call}
             },
             {
-                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
+                {NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
+                {NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C4R)>::call},
                 {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
+                {NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C4R)>::call},
+                {NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C4R)>::call}
             }
         };