From 6ca24c818f8266c3a7e554e7a583dcdf5dc2010f Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Wed, 17 Jun 2020 01:04:22 +0900 Subject: [PATCH] [moved from opencv] cuda optflow tvl1 : async safety also modify cuda canny to use createTextureObjectPitch2D, etc. original commit: https://github.com/opencv/opencv/commit/2043e06102fadd5df4e52853d8f08f0510763aff --- modules/cudaimgproc/src/cuda/canny.cu | 67 ++++++-------- modules/cudaimgproc/test/test_canny.cpp | 2 +- modules/cudaoptflow/src/cuda/tvl1flow.cu | 107 ++++++++++++++++++++--- 3 files changed, 123 insertions(+), 53 deletions(-) diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index 4418b8e5e..253287ca3 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -90,53 +90,47 @@ namespace cv { namespace cuda { namespace device namespace canny { - texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); struct SrcTex { + virtual ~SrcTex() {} + + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} + + __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0; + int xoff; int yoff; - __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} + }; + + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTexRef : SrcTex + { + __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {} - __device__ __forceinline__ int operator ()(int y, int x) const + __device__ __forceinline__ int operator ()(int y, int x) const override { return tex2D(tex_src, x + xoff, y + yoff); } }; - struct SrcTexObject + struct SrcTexObj : SrcTex { - int xoff; - int yoff; - cudaTextureObject_t tex_src_object; - __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { } + __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { } - __device__ __forceinline__ int operator ()(int y, int x) const + __device__ __forceinline__ int operator ()(int y, int x) const override { return tex2D(tex_src_object, x + xoff, y + yoff); } + cudaTextureObject_t tex_src_object; }; - template __global__ - void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (y >= mag.rows || x >= mag.cols) - return; - - int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); - int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); - - dx(y, x) = dxVal; - dy(y, x) = dyVal; - - mag(y, x) = norm(dxVal, dyVal); - } - - template __global__ - void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + template < + class T, + class Norm, + typename = std::enable_if_t::value> + > + __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -162,15 +156,6 @@ namespace canny if (cc30) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = srcWhole.ptr(); - resDesc.res.pitch2D.height = srcWhole.rows; - resDesc.res.pitch2D.width = srcWhole.cols; - resDesc.res.pitch2D.pitchInBytes = srcWhole.step; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeClamp; @@ -178,9 +163,9 @@ namespace canny texDesc.addressMode[2] = cudaAddressModeClamp; cudaTextureObject_t tex = 0; - cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + createTextureObjectPitch2D(&tex, srcWhole, texDesc); - SrcTexObject src(xoff, yoff, tex); + SrcTexObj src(xoff, yoff, tex); if (L2Grad) { @@ -205,7 +190,7 @@ namespace canny else { bindTexture(&tex_src, srcWhole); - SrcTex src(xoff, yoff); + SrcTexRef src(xoff, yoff); if (L2Grad) { diff --git a/modules/cudaimgproc/test/test_canny.cpp b/modules/cudaimgproc/test/test_canny.cpp index a782a87b3..1b48e7d32 100644 --- a/modules/cudaimgproc/test/test_canny.cpp +++ b/modules/cudaimgproc/test/test_canny.cpp @@ -116,7 +116,7 @@ protected: bool useL2gradient; }; -#define NUM_STREAMS 64 +#define NUM_STREAMS 128 CUDA_TEST_P(Canny, Async) { diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu index 66f0d664a..2688e05c6 100644 --- a/modules/cudaoptflow/src/cuda/tvl1flow.cu +++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu @@ -45,6 +45,7 @@ #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/limits.hpp" +#include "opencv2/core/cuda.hpp" using namespace cv::cuda; using namespace cv::cuda::device; @@ -101,11 +102,64 @@ namespace tvl1flow } } + struct SrcTex + { + virtual ~SrcTex() {} + + __device__ __forceinline__ virtual float I1(float x, float y) const = 0; + __device__ __forceinline__ virtual float I1x(float x, float y) const = 0; + __device__ __forceinline__ virtual float I1y(float x, float y) const = 0; + }; + texture tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp); texture tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp); texture tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTexRef : SrcTex + { + __device__ __forceinline__ float I1(float x, float y) const override + { + return tex2D(tex_I1, x, y); + } + __device__ __forceinline__ float I1x(float x, float y) const override + { + return tex2D(tex_I1x, x, y); + } + __device__ __forceinline__ float I1y(float x, float y) const override + { + return tex2D(tex_I1y, x, y); + } + }; + + struct SrcTexObj : SrcTex + { + __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_) + : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {} + + __device__ __forceinline__ float I1(float x, float y) const override + { + return tex2D(tex_obj_I1, x, y); + } + __device__ __forceinline__ float I1x(float x, float y) const override + { + return tex2D(tex_obj_I1x, x, y); + } + __device__ __forceinline__ float I1y(float x, float y) const override + { + return tex2D(tex_obj_I1y, x, y); + } - __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) + cudaTextureObject_t tex_obj_I1; + cudaTextureObject_t tex_obj_I1x; + cudaTextureObject_t tex_obj_I1y; + }; + + template < + typename T, + typename = std::enable_if_t::value> + > + __global__ void warpBackwardKernel( + const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2, + PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -136,9 +190,9 @@ namespace tvl1flow { const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); - sum += w * tex2D(tex_I1 , cx, cy); - sumx += w * tex2D(tex_I1x, cx, cy); - sumy += w * tex2D(tex_I1y, cx, cy); + sum += w * src.I1(cx, cy); + sumx += w * src.I1x(cx, cy); + sumy += w * src.I1y(cx, cy); wsum += w; } @@ -173,15 +227,46 @@ namespace tvl1flow const dim3 block(32, 8); const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y)); - bindTexture(&tex_I1 , I1); - bindTexture(&tex_I1x, I1x); - bindTexture(&tex_I1y, I1y); + bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); - warpBackwardKernel<<>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho); - cudaSafeCall( cudaGetLastError() ); + if (cc30) + { + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = cudaAddressModeClamp; + texDesc.addressMode[1] = cudaAddressModeClamp; + texDesc.addressMode[2] = cudaAddressModeClamp; - if (!stream) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0; + + createTextureObjectPitch2D(&texObj_I1, I1, texDesc); + createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc); + createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc); + + warpBackwardKernel << > > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho); + cudaSafeCall(cudaGetLastError()); + + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); + else + cudaSafeCall(cudaStreamSynchronize(stream)); + + cudaSafeCall(cudaDestroyTextureObject(texObj_I1)); + cudaSafeCall(cudaDestroyTextureObject(texObj_I1x)); + cudaSafeCall(cudaDestroyTextureObject(texObj_I1y)); + } + else + { + bindTexture(&tex_I1, I1); + bindTexture(&tex_I1x, I1x); + bindTexture(&tex_I1y, I1y); + + warpBackwardKernel << > > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho); + cudaSafeCall(cudaGetLastError()); + + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); + } } }