Merge pull request #3378 from cudawarped:replace_texture_ref_with_texture_obj

Fix CUDA texture bugs and replace all instances of CUDA texture references with texture objects
commit 8db3e627fb
Alexander Smorkalov (committed via GitHub)
33 changed files (lines changed in parentheses):

 1. modules/cudaarithm/src/cuda/lut.cu (84)
 2. modules/cudaarithm/src/lut.hpp (6)
 3. modules/cudaimgproc/src/cuda/canny.cu (217)
 4. modules/cudaimgproc/src/cuda/corners.cu (55)
 5. modules/cudaimgproc/src/cuda/debayer.cu (48)
 6. modules/cudaimgproc/src/cuda/gftt.cu (52)
 7. modules/cudaimgproc/src/cuda/hough_segments.cu (25)
 8. modules/cudaimgproc/src/cuda/mean_shift.cu (39)
 9. modules/cudaimgproc/src/gftt.cpp (24)
10. modules/cudaimgproc/test/test_color.cpp (18)
11. modules/cudaimgproc/test/test_hough.cpp (64)
12. modules/cudaimgproc/test/test_precomp.hpp (2)
13. modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp (4)
14. modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp (10)
15. modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu (363)
16. modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu (352)
17. modules/cudalegacy/src/cuda/NPP_staging.cu (428)
18. modules/cudalegacy/src/cuda/bm.cu (24)
19. modules/cudalegacy/test/TestHypothesesGrow.cpp (3)
20. modules/cudaobjdetect/src/cuda/hog.cu (52)
21. modules/cudaobjdetect/test/test_objdetect.cpp (13)
22. modules/cudaoptflow/src/cuda/pyrlk.cu (306)
23. modules/cudaoptflow/src/cuda/tvl1flow.cu (113)
24. modules/cudastereo/src/cuda/stereobm.cu (39)
25. modules/cudawarping/src/cuda/remap.cu (189)
26. modules/cudawarping/src/cuda/resize.cu (112)
27. modules/cudawarping/src/cuda/warp.cu (161)
28. modules/cudawarping/test/test_precomp.hpp (2)
29. modules/cudawarping/test/test_resize.cpp (54)
30. modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp (429)
31. modules/cudev/include/opencv2/cudev/warp/shuffle.hpp (2)
32. modules/xfeatures2d/src/cuda/surf.cu (140)
33. modules/xfeatures2d/src/surf.cuda.cpp (34)

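Background for the diff below: the file-scope texture references (`texture<...>` globals) being deleted in every file here were deprecated for years and removed entirely in CUDA 12, and as global state they also break concurrent use of the same kernel from multiple streams or threads. Texture objects are ordinary run-time values passed to kernels as arguments. A minimal standalone sketch of the replacement pattern, using only the raw CUDA runtime API (none of this code is from the PR):

#include <cstdio>
#include <cuda_runtime.h>

// The legacy path relied on a file-scope texture reference, which is global
// state and no longer compiles on CUDA 12:
//
//   texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src;
//   ...bind per call with cudaBindTexture2D, unbind afterwards...
//
// The replacement passes a cudaTextureObject_t by value, so each call site
// owns its own handle and concurrent streams cannot race on a global binding.
__global__ void readThroughTexture(cudaTextureObject_t tex, unsigned char* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<unsigned char>(tex, x, y); // object, not global ref
}

int main()
{
    const int w = 64, h = 64;
    size_t pitch = 0;
    unsigned char *src = nullptr, *dst = nullptr;
    cudaMallocPitch(&src, &pitch, w, h);
    cudaMalloc(&dst, w * h);

    // Describe the pitched 2D buffer...
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = src;
    resDesc.res.pitch2D.width = w;
    resDesc.res.pitch2D.height = h;
    resDesc.res.pitch2D.pitchInBytes = pitch;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

    // ...and how it should be sampled (point filtering, clamped addressing).
    cudaTextureDesc texDesc = {};
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);

    readThroughTexture<<<dim3(8, 8), dim3(8, 8)>>>(tex, dst, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex); // must outlive every kernel that samples it
    cudaFree(dst);
    cudaFree(src);
    printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}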
modules/cudaarithm/src/cuda/lut.cu

@@ -53,6 +53,7 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
 #include "opencv2/core/private.cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 using namespace cv;
 using namespace cv::cuda;
@@ -60,8 +61,6 @@ using namespace cv::cudev;
 namespace cv { namespace cuda {
-    texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;
 LookUpTableImpl::LookUpTableImpl(InputArray _lut)
 {
     if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
@@ -73,83 +72,28 @@ namespace cv { namespace cuda {
         Mat h_lut = _lut.getMat();
         d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
     }
     CV_Assert( d_lut.depth() == CV_8U );
     CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );
+    szInBytes = 256 * d_lut.channels() * sizeof(uchar);
-    cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-    if (cc30)
-    {
-        // Use the texture object
-        cudaResourceDesc texRes;
-        std::memset(&texRes, 0, sizeof(texRes));
-        texRes.resType = cudaResourceTypeLinear;
-        texRes.res.linear.devPtr = d_lut.data;
-        texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
-        texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);
-        cudaTextureDesc texDescr;
-        std::memset(&texDescr, 0, sizeof(texDescr));
-        CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
-    }
-    else
-    {
-        // Use the texture reference
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
-        CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
-    }
-}
-LookUpTableImpl::~LookUpTableImpl()
-{
-    if (cc30)
-    {
-        // Use the texture object
-        cudaDestroyTextureObject(texLutTableObj);
-    }
-    else
-    {
-        // Use the texture reference
-        cudaUnbindTexture(texLutTable);
-    }
 }
 struct LutTablePtrC1
 {
     typedef uchar value_type;
     typedef uchar index_type;
-    cudaTextureObject_t texLutTableObj;
-    __device__ __forceinline__ uchar operator ()(uchar, uchar x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return tex1Dfetch(texLutTable, x);
-    #else
-        // Use the texture object
-        return tex1Dfetch<uchar>(texLutTableObj, x);
-    #endif
+    cv::cudev::TexturePtr<uchar> tex;
+    __device__ __forceinline__ uchar operator ()(uchar, uchar x) const {
+        return tex(x);
     }
 };
 struct LutTablePtrC3
 {
     typedef uchar3 value_type;
     typedef uchar3 index_type;
-    cudaTextureObject_t texLutTableObj;
-    __device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
-    #else
-        // Use the texture object
-        return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
-    #endif
+    cv::cudev::TexturePtr<uchar> tex;
+    __device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const {
+        return make_uchar3(tex(x.x * 3), tex(x.y * 3 + 1), tex(x.z * 3 + 2));
     }
 };
@@ -169,20 +113,18 @@ namespace cv { namespace cuda {
         {
             GpuMat_<uchar> src1(src.reshape(1));
             GpuMat_<uchar> dst1(dst.reshape(1));
+            cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
             LutTablePtrC1 tbl;
-            tbl.texLutTableObj = texLutTableObj;
+            tbl.tex = TexturePtr<uchar>(tex);
             dst1.assign(lut_(src1, tbl), stream);
         }
         else if (lut_cn == 3)
         {
             GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
             GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;
+            cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
             LutTablePtrC3 tbl;
-            tbl.texLutTableObj = texLutTableObj;
+            tbl.tex = TexturePtr<uchar>(tex);
             dst3.assign(lut_(src3, tbl), stream);
         }

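The lut.cu change above also moves lifetime management into the call: transform() now builds a scoped texture object over d_lut on each invocation instead of binding a global reference in the constructor and unbinding in a destructor. A minimal sketch of the RAII idea, with hypothetical names (the real wrapper is the new cv::cudev::Texture in modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp, which has a richer interface):

#include <cstddef>
#include <cuda_runtime.h>

// ScopedLinearTexture is illustrative only; it wraps an existing linear
// device buffer in a texture object and destroys the object at scope exit.
template <typename T>
class ScopedLinearTexture {
public:
    ScopedLinearTexture(std::size_t szBytes, T* devPtr) : tex_(0) {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypeLinear;
        res.res.linear.devPtr = devPtr;
        res.res.linear.desc = cudaCreateChannelDesc<T>();
        res.res.linear.sizeInBytes = szBytes;
        cudaTextureDesc td = {};
        cudaCreateTextureObject(&tex_, &res, &td, nullptr);
    }
    ~ScopedLinearTexture() { cudaDestroyTextureObject(tex_); }
    ScopedLinearTexture(const ScopedLinearTexture&) = delete;
    ScopedLinearTexture& operator=(const ScopedLinearTexture&) = delete;
    cudaTextureObject_t handle() const { return tex_; }
private:
    cudaTextureObject_t tex_;
};

__global__ void applyLut(cudaTextureObject_t lut, const unsigned char* in, unsigned char* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = tex1Dfetch<unsigned char>(lut, in[i]); // 1D fetch, as in LutTablePtrC1
}

int main()
{
    const int n = 1024;
    unsigned char *lutData, *in, *out;
    cudaMalloc(&lutData, 256);
    cudaMalloc(&in, n);
    cudaMalloc(&out, n);
    {
        ScopedLinearTexture<unsigned char> lut(256, lutData);
        applyLut<<<(n + 255) / 256, 256>>>(lut.handle(), in, out, n);
        cudaDeviceSynchronize(); // finish sampling before the wrapper destroys the object
    }
    cudaFree(lutData); cudaFree(in); cudaFree(out);
    return 0;
}

Note the handle must stay alive until every kernel that samples it has finished, which is why transform() keeps the Texture in the calling scope for the duration of the call.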
modules/cudaarithm/src/lut.hpp

@@ -15,14 +15,10 @@ class LookUpTableImpl : public LookUpTable
 {
 public:
     LookUpTableImpl(InputArray lut);
-    ~LookUpTableImpl();
     void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) CV_OVERRIDE;
 private:
     GpuMat d_lut;
-    cudaTextureObject_t texLutTableObj;
-    bool cc30;
+    size_t szInBytes = 0;
 };
 }}

modules/cudaimgproc/src/cuda/canny.cu

@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device
 namespace canny
 {
-    struct SrcTex
-    {
-        virtual ~SrcTex() {}
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
-        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
-        int xoff;
-        int yoff;
-    };
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
-    struct SrcTexRef : SrcTex
-    {
-        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D(tex_src, x + xoff, y + yoff);
-        }
-    };
-    struct SrcTexObj : SrcTex
-    {
-        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
-        }
-        cudaTextureObject_t tex_src_object;
-    };
-    template <
-        class T,
-        class Norm,
-        typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
-    >
-    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    template <class Norm>
+    __global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr<uchar> texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -138,8 +100,8 @@ namespace canny
         if (y >= mag.rows || x >= mag.cols)
             return;
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+        int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1));
+        int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1));
         dx(y, x) = dxVal;
         dy(y, x) = dyVal;
@@ -151,63 +113,20 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
+        cv::cudev::TextureOff<uchar> texSrc(srcWhole, yoff, xoff);
-        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-        if (cc30)
+        if (L2Grad)
         {
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-            cudaTextureObject_t tex = 0;
-            createTextureObjectPitch2D(&tex, srcWhole, texDesc);
-            SrcTexObj src(xoff, yoff, tex);
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
+            L2 norm;
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
         else
         {
-            bindTexture(&tex_src, srcWhole);
-            SrcTexRef src(xoff, yoff);
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
+            L1 norm;
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
+        if (stream == NULL)
+            cudaSafeCall(cudaDeviceSynchronize());
     }
     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -229,8 +148,7 @@ namespace canny
 namespace canny
 {
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    __global__ void calcMapKernel(cv::cudev::TexturePtr<float> texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
     {
         const int CANNY_SHIFT = 15;
         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
@@ -245,7 +163,7 @@ namespace canny
         int dyVal = dy(y, x);
         const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D(tex_mag, x, y);
+        const float m = texMag(y, x);
         dxVal = ::abs(dxVal);
         dyVal = ::abs(dyVal);
@@ -264,69 +182,17 @@ namespace canny
         if (dyVal < tg22x)
         {
-            if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+            if (m > texMag(y, x - 1) && m >= texMag(y, x + 1))
                 edge_type = 1 + (int)(m > high_thresh);
         }
         else if(dyVal > tg67x)
         {
-            if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+            if (m > texMag(y - 1, x) && m >= texMag(y + 1, x))
                 edge_type = 1 + (int)(m > high_thresh);
         }
         else
         {
-            if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-    }
-    map(y, x) = edge_type;
-}
-__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
-{
-    const int CANNY_SHIFT = 15;
-    const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
-        return;
-    int dxVal = dx(y, x);
-    int dyVal = dy(y, x);
-    const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-    const float m = tex2D<float>(tex_mag, x, y);
-    dxVal = ::abs(dxVal);
-    dyVal = ::abs(dyVal);
-    // 0 - the pixel can not belong to an edge
-    // 1 - the pixel might belong to an edge
-    // 2 - the pixel does belong to an edge
-    int edge_type = 0;
-    if (m > low_thresh)
-    {
-        const int tg22x = dxVal * TG22;
-        const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-        dyVal <<= CANNY_SHIFT;
-        if (dyVal < tg22x)
-        {
-            if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-        else if(dyVal > tg67x)
-        {
-            if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-        else
-        {
-            if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+            if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s))
                 edge_type = 1 + (int)(m > high_thresh);
         }
     }
@@ -338,47 +204,10 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
+        cv::cudev::Texture<float> texMag(mag);
+        calcMapKernel<<<grid, block, 0, stream>>>(texMag, dx, dy, map, low_thresh, high_thresh);
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
-        if (deviceSupports(FEATURE_SET_COMPUTE_30))
-        {
-            // Use the texture object
-            cudaResourceDesc resDesc;
-            memset(&resDesc, 0, sizeof(resDesc));
-            resDesc.resType = cudaResourceTypePitch2D;
-            resDesc.res.pitch2D.devPtr = mag.ptr();
-            resDesc.res.pitch2D.height = mag.rows;
-            resDesc.res.pitch2D.width = mag.cols;
-            resDesc.res.pitch2D.pitchInBytes = mag.step;
-            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-            cudaTextureObject_t tex=0;
-            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
-        }
-        else
-        {
-            // Use the texture reference
-            bindTexture(&tex_mag, mag);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
     }
 }

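A recurring device-side pattern in this PR: calcMagnitude above samples the source through cv::cudev::TextureOff, a texture plus a fixed row/column offset, so kernels can index in ROI coordinates while the texture object covers the whole image. A standalone sketch of that idea; OffsetTex is a hypothetical stand-in for cv::cudev::TextureOffPtr<uchar>, not the PR's implementation:

#include <cstdio>
#include <cuda_runtime.h>

struct OffsetTex
{
    cudaTextureObject_t tex;
    int xoff, yoff; // ROI origin inside the whole image
    __device__ __forceinline__ int operator()(int y, int x) const
    {
        // kernels index in ROI coordinates; the offset maps them back to the
        // whole-image coordinates the texture object was created over
        return tex2D<unsigned char>(tex, x + xoff, y + yoff);
    }
};

__global__ void sobelDx(OffsetTex src, int* dx, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h)
        return;
    // clamped addressing on the texture object takes care of the borders
    dx[y * w + x] = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1))
                  - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
}

int main()
{
    const int W = 128, H = 128;  // whole image
    const int w = 64, h = 64;    // ROI size
    size_t pitch = 0;
    unsigned char* img = nullptr;
    int* dx = nullptr;
    cudaMallocPitch(&img, &pitch, W, H);
    cudaMalloc(&dx, w * h * sizeof(int));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = img;
    res.res.pitch2D.width = W;
    res.res.pitch2D.height = H;
    res.res.pitch2D.pitchInBytes = pitch;
    res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
    cudaTextureDesc td = {};
    td.filterMode = cudaFilterModePoint;
    td.addressMode[0] = cudaAddressModeClamp;
    td.addressMode[1] = cudaAddressModeClamp;
    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    OffsetTex src = { tex, 32, 32 }; // ROI starts at (32, 32)
    sobelDx<<<dim3(8, 8), dim3(8, 8)>>>(src, dx, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dx);
    cudaFree(img);
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}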
modules/cudaimgproc/src/cuda/corners.cu

@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 #include "opencv2/opencv_modules.hpp"
@@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device
 {
     /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
-    texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
+    __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device
         {
             for (int j = jbegin; j < jend; ++j)
             {
-                float dx = tex2D(harrisDxTex, j, i);
-                float dy = tex2D(harrisDyTex, j, i);
+                float dx = texDx(i, j);
+                float dy = texDy(i, j);
                 a += dx * dx;
                 b += dx * dy;
@@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device
     }
     template <typename BR, typename BC>
-    __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
+    __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device
             {
                 const int x = border_row.idx_col(j);
-                float dx = tex2D(harrisDxTex, x, y);
-                float dy = tex2D(harrisDyTex, x, y);
+                float dx = texDx(y, x);
+                float dy = texDy(y, x);
                 a += dx * dx;
                 b += dx * dy;
@@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device
     {
         dim3 block(32, 8);
         dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+        cv::cudev::Texture<float> texDx(Dx);
+        cv::cudev::Texture<float> texDy(Dy);
-        bindTexture(&harrisDxTex, Dx);
-        bindTexture(&harrisDyTex, Dy);
         switch (border_type)
         {
         case BORDER_REFLECT101:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
            break;
         case BORDER_REFLECT:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
            break;
         case BORDER_REPLICATE:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst);
            break;
         }
@@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device
     /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
-    texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
+    __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device
         {
             for (int j = jbegin; j < jend; ++j)
             {
-                float dx = tex2D(minEigenValDxTex, j, i);
-                float dy = tex2D(minEigenValDyTex, j, i);
+                float dx = texMinEigenValDx(i, j);
+                float dy = texMinEigenValDy(i, j);
                 a += dx * dx;
                 b += dx * dy;
@@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device
     template <typename BR, typename BC>
-    __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
+    __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device
             {
                 int x = border_row.idx_col(j);
-                float dx = tex2D(minEigenValDxTex, x, y);
-                float dy = tex2D(minEigenValDyTex, x, y);
+                float dx = texMinEigenValDx(y, x);
+                float dy = texMinEigenValDy(y, x);
                 a += dx * dx;
                 b += dx * dy;
@@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device
     {
         dim3 block(32, 8);
         dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+        cv::cudev::Texture<float> texMinEigenValDx(Dx);
+        cv::cudev::Texture<float> texMinEigenValDy(Dy);
-        bindTexture(&minEigenValDxTex, Dx);
-        bindTexture(&minEigenValDyTex, Dy);
         switch (border_type)
         {
         case BORDER_REFLECT101:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
            break;
         case BORDER_REFLECT:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
            break;
         case BORDER_REPLICATE:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst);
            break;
         }

modules/cudaimgproc/src/cuda/debayer.cu

@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda/color.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev/ptr2d/texture.hpp"
 namespace cv { namespace cuda { namespace device
 {
@@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device
     //
     // ported to CUDA
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
-    template <typename DstType>
-    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
+    template <typename DstType, class Ptr2D>
+    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
     {
         const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
         const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
@@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device
             return;
         int2 center;
-        center.x = x + sourceOffset.x;
-        center.y = y + sourceOffset.y;
+        center.x = x;
+        center.y = y;
         int4 xCoord;
         xCoord.x = center.x - 2;
@@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device
         yCoord.z = center.y + 1;
         yCoord.w = center.y + 2;
-        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+        float C = src(center.y, center.x); // ( 0, 0)
         float4 Dvec;
-        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
-        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
-        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
-        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+        Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1)
+        Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1)
+        Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1)
+        Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1)
         float4 value;
-        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
-        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
-        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
-        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+        value.x = src(yCoord.x, center.x); // ( 0,-2) A0
+        value.y = src(yCoord.y, center.x); // ( 0,-1) B0
+        value.z = src(center.y, xCoord.x); // (-2, 0) E0
+        value.w = src(center.y, xCoord.y); // (-1, 0) F0
         // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
-        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
-        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
-        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
-        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+        value.x += src(yCoord.w, center.x); // ( 0, 2) A1
+        value.y += src(yCoord.z, center.x); // ( 0, 1) B1
+        value.z += src(center.y, xCoord.w); // ( 2, 0) E1
+        value.w += src(center.y, xCoord.z); // ( 1, 0) F1
         float4 PATTERN;
         PATTERN.x = kCx * C;
@@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-        bindTexture(&sourceTex, src);
-        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
+        if (sourceOffset.x || sourceOffset.y) {
+            cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x);
+            MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
+        else {
+            cv::cudev::Texture<uchar> texSrc(src);
+            MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)

modules/cudaimgproc/src/cuda/gftt.cu

@@ -45,36 +45,36 @@
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
-#include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/utility.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 #include <thrust/execution_policy.h>
 namespace cv { namespace cuda { namespace device
 {
     namespace gfft
     {
-        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
+        template <class Mask> __global__ void findCorners(cv::cudev::TexturePtr<float> tex, float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, int *g_counter)
         {
             const int j = blockIdx.x * blockDim.x + threadIdx.x;
             const int i = blockIdx.y * blockDim.y + threadIdx.y;
             if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
             {
-                float val = tex2D<float>(eigTex, j, i);
+                float val = tex(i, j);
                 if (val > threshold)
                 {
                     float maxVal = val;
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j    , i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j    , i + 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j + 1), maxVal);
+                    maxVal = ::fmax(tex(i, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i, j + 1), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j + 1), maxVal);
                     if (val == maxVal)
                     {
@@ -87,17 +87,18 @@ namespace cv { namespace cuda { namespace device
             }
         }
-        int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
+        int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
         {
             cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
+            cv::cudev::Texture<float> tex(eig);
             dim3 block(16, 16);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
+            dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
             if (mask.data)
-                findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counterPtr);
+                findCorners<<<grid, block, 0, stream>>>(tex, threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols, counterPtr);
             else
-                findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counterPtr);
+                findCorners<<<grid, block, 0, stream>>>(tex, threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols, counterPtr);
             cudaSafeCall( cudaGetLastError() );
@@ -113,27 +114,24 @@ namespace cv { namespace cuda { namespace device
         class EigGreater
         {
         public:
-            EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
-            {
-            }
-            __device__ __forceinline__ bool operator()(float2 a, float2 b) const
-            {
-                return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
+            EigGreater(cv::cudev::TexturePtr<float> tex_) : tex(tex_) {}
+            __device__ __forceinline__ bool operator()(float2 a, float2 b) const{
+                return tex(a.y, a.x) > tex(b.y, b.x);
             }
-            cudaTextureObject_t eigTex;
+            cv::cudev::TexturePtr<float> tex;
         };
-        void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
+        void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
         {
+            cv::cudev::Texture<float> tex(eig);
             thrust::device_ptr<float2> ptr(corners);
 #if THRUST_VERSION >= 100802
             if (stream)
-                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(tex));
             else
-                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(tex));
 #else
-            thrust::sort(ptr, ptr + count, EigGreater(eigTex));
+            thrust::sort(ptr, ptr + count, EigGreater(tex));
 #endif
         }
     } // namespace optical_flow

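In gftt.cu above, sortCorners_gpu hands thrust::sort a comparator that ranks candidate corners by sampling the eigenvalue map. The same pattern, reduced to a standalone sketch with a plain device pointer standing in for the cv::cudev::TexturePtr<float> the PR passes to EigGreater (names here are illustrative, not PR code):

#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

struct ResponseGreater
{
    const float* resp; // device pointer into the per-pixel response map
    int step;          // row length in elements
    __host__ __device__ ResponseGreater(const float* r, int s) : resp(r), step(s) {}
    __device__ bool operator()(float2 a, float2 b) const
    {
        // order corners by the response at their (x, y) location, descending
        return resp[(int)a.y * step + (int)a.x] > resp[(int)b.y * step + (int)b.x];
    }
};

int main()
{
    const int w = 4;
    const float h_resp[] = { 0.1f, 0.9f, 0.3f, 0.5f,
                             0.2f, 0.8f, 0.4f, 0.6f };
    thrust::device_vector<float> resp(h_resp, h_resp + 8);
    thrust::device_vector<float2> corners;
    corners.push_back(make_float2(0, 0)); // response 0.1
    corners.push_back(make_float2(1, 0)); // response 0.9
    corners.push_back(make_float2(3, 1)); // response 0.6

    thrust::sort(corners.begin(), corners.end(),
                 ResponseGreater(thrust::raw_pointer_cast(resp.data()), w));

    float2 best = corners[0];
    printf("strongest corner: (%g, %g)\n", best.x, best.y); // expect (1, 0)
    return 0;
}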
modules/cudaimgproc/src/cuda/hough_segments.cu

@@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_segments
     {
-        __global__ void houghLinesProbabilistic(cv::cudev::Texture<uchar> src, const PtrStepSzi accum,
+        template<class Ptr2D>
+        __global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum,
                                                 int4* out, const int maxSize,
                                                 const float rho, const float theta,
                                                 const int lineGap, const int lineLength,
@@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device
         const dim3 block(32, 8);
         const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
-        cv::cudev::GpuMat_<uchar> src_(mask);
-        cv::cudev::Texture<uchar> tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp);
-        houghLinesProbabilistic<<<grid, block, 0, stream>>>(tex, accum,
-            out, maxSize,
-            rho, theta,
-            lineGap, lineLength,
-            mask.rows, mask.cols,
-            counterPtr);
+        Size wholeSize;
+        Point ofs;
+        mask.locateROI(wholeSize, ofs);
+        if (ofs.x || ofs.y) {
+            cv::cudev::TextureOff<uchar> texMask(wholeSize.height, wholeSize.width, mask.datastart, mask.step, ofs.y, ofs.x);
+            houghLinesProbabilistic<cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+        }
+        else {
+            cv::cudev::Texture<uchar> texMask(mask);
+            houghLinesProbabilistic<cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+        }
         cudaSafeCall( cudaGetLastError() );
         int totalCount;
@@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device
         cudaSafeCall( cudaStreamSynchronize(stream) );
         totalCount = ::min(totalCount, maxSize);
-
         return totalCount;
     }
 }

modules/cudaimgproc/src/cuda/mean_shift.cu

@@ -47,19 +47,16 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 namespace cv { namespace cuda { namespace device
 {
     namespace imgproc
     {
-        texture<uchar4, 2> tex_meanshift;
-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
-                                        size_t out_step, int cols, int rows,
-                                        int sp, int sr, int maxIter, float eps)
+        __device__ short2 do_mean_shift(cv::cudev::TexturePtr<uchar4> tex, int x0, int y0, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
         {
             int isr2 = sr*sr;
-            uchar4 c = tex2D(tex_meanshift, x0, y0 );
+            uchar4 c = tex(y0, x0);
             // iterate meanshift procedure
             for( int iter = 0; iter < maxIter; iter++ )
@@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device
                 int rowCount = 0;
                 for( int x = minx; x <= maxx; x++ )
                 {
-                    uchar4 t = tex2D( tex_meanshift, x, y );
+                    uchar4 t = tex(y, x);
                     int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                     if( norm2 <= isr2 )
@@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device
             return make_short2((short)x0, (short)y0);
         }
-        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
+        __global__ void meanshift_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
             if( x0 < cols && y0 < rows )
-                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
+                do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
         }
         void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
@@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+            meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
-                                             unsigned char* outsp, size_t outspstep,
-                                             int cols, int rows,
-                                             int sp, int sr, int maxIter, float eps)
+        __global__ void meanshiftproc_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep,
+                                             int cols, int rows, int sp, int sr, int maxIter, float eps)
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device
             if( x0 < cols && y0 < rows )
             {
                 int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
-                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
+                *(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
             }
         }
@@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }

modules/cudaimgproc/src/gftt.cpp

@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace gfft
     {
-        int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
-        void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
+        int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
+        void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
     }
 }}}
@@ -120,31 +120,15 @@ namespace
         cudaStream_t stream_ = StreamAccessor::getStream(stream);
         ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
-        //create texture object for findCorners_gpu and sortCorners_gpu
-        cudaTextureDesc texDesc;
-        memset(&texDesc, 0, sizeof(texDesc));
-        texDesc.readMode = cudaReadModeElementType;
-        texDesc.filterMode = cudaFilterModePoint;
-        texDesc.addressMode[0] = cudaAddressModeClamp;
-        texDesc.addressMode[1] = cudaAddressModeClamp;
-        texDesc.addressMode[2] = cudaAddressModeClamp;
-        cudaTextureObject_t eigTex_;
-        PtrStepSzf eig = eig_;
-        cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);
-        int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
+        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
         if (total == 0)
         {
             _corners.release();
-            cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
             return;
         }
-        sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);
-        cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
+        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
         if (minDistance_ < 1)
         {

modules/cudaimgproc/test/test_color.cpp

@@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Demosaicing
-struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
+struct Demosaicing : testing::TestWithParam<testing::tuple<cv::cuda::DeviceInfo, bool>>
 {
     cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
     virtual void SetUp()
     {
-        devInfo = GetParam();
+        devInfo = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
     mosaic(img, src, cv::Point(1, 1));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
     mosaic(img, src, cv::Point(0, 1));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
     mosaic(img, src, cv::Point(0, 0));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
     mosaic(img, src, cv::Point(1, 0));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
-INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT));
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // swapChannels

modules/cudaimgproc/test/test_hough.cpp

@@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines Probabilistic
-PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi)
 {
+    cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
+    Size size;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
     static void generateLines(cv::Mat& img)
     {
         img.setTo(cv::Scalar::all(0));
@@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
 CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 {
-    const cv::cuda::DeviceInfo devInfo = GET_PARAM(0);
-    cv::cuda::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
     const float rho = 1.0f;
     const float theta = (float) (1.0 * CV_PI / 180.0);
     const int minLineLength = 15;
@@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 }
+void HoughLinesProbabilisticThread(const Ptr<HoughSegmentDetector> detector, const GpuMat& imgIn, const std::vector<GpuMat>& linesOut, Stream& stream) {
+    for (auto& lines : linesOut)
+        detector->detect(imgIn, lines, stream);
+    stream.waitForCompletion();
+}
+CUDA_TEST_P(HoughLinesProbabilistic, Async)
+{
+    constexpr int nThreads = 5;
+    constexpr int nIters = 5;
+    vector<Stream> streams(nThreads); // async test only
+    vector<GpuMat> imgsIn;
+    vector<Ptr<HoughSegmentDetector>> detectors;
+    vector<vector<GpuMat>> linesOut(nThreads);
+    const float rho = 1.0f;
+    const float theta = (float)(1.0 * CV_PI / 180.0);
+    const int minLineLength = 15;
+    const int maxLineGap = 8;
+    cv::Mat src(size, CV_8UC1);
+    generateLines(src);
+    for (int i = 0; i < nThreads; i++) {
+        imgsIn.push_back(loadMat(src, useRoi));
+        detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap));
+        linesOut.push_back(vector<GpuMat>(nIters));
+    }
+    vector<std::thread> thread(nThreads);
+    for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i)));
+    for (int i = 0; i < nThreads; i++) thread.at(i).join();
+    for (int i = 0; i < nThreads; i++) {
+        std::vector<cv::Vec4i> linesSegment;
+        std::vector<cv::Vec2f> lines;
+        for (const auto& line : linesOut.at(i)) {
+            line.download(linesSegment);
+            cv::Mat dst(size, CV_8UC1);
+            drawLines(dst, linesSegment);
+            ASSERT_MAT_NEAR(src, dst, 0.0);
+        }
+    }
+}
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     WHOLE_SUBMAT));
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughCircles

modules/cudaimgproc/test/test_precomp.hpp

@@ -49,4 +49,6 @@
 #include "cvconfig.h"
+
+#include <thread>
 #endif

modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp

@@ -119,9 +119,9 @@ typedef bool NcvBool;
 typedef long long Ncv64s;
 #if defined(__APPLE__) && !defined(__CUDACC__)
-typedef uint64_t Ncv64u;
+typedef uint64 Ncv64u;
 #else
-typedef unsigned long long Ncv64u;
+typedef uint64 Ncv64u;
 #endif
 typedef int Ncv32s;

modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp

@@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         Ncv32f *pDst,
@@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
                                         Ncv32u nDstStep,
                                         NcvRect32u oROI,
                                         NppStBorderType borderType,
-                                        const Ncv32f *pKernel,
+                                        Ncv32f *pKernel,
                                         Ncv32s nKernelSize,
                                         Ncv32s nAnchor,
                                         Ncv32f multiplier);
@@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
                                            NcvSize32u srcSize,
                                            Ncv32u nSrcStep,
                                            Ncv32f *pDst,
@@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
                                            Ncv32u nDstStep,
                                            NcvRect32u oROI,
                                            NppStBorderType borderType,
-                                           const Ncv32f *pKernel,
+                                           Ncv32f *pKernel,
                                            Ncv32s nKernelSize,
                                            Ncv32s nAnchor,
                                            Ncv32f multiplier);
@@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
                                NcvSize32u srcSize,
                                Ncv32u nSrcStep,
                                NcvRect32u srcROI,

modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu

@@ -65,9 +65,12 @@
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 typedef NCVVectorAlloc<Ncv32f> FloatVector;
+typedef cv::cudev::TexturePtr<float> Ptr2D;
+typedef cv::cudev::Texture<float> Texture;
 /////////////////////////////////////////////////////////////////////////////////////////
 // Implementation specific constants
@@ -84,39 +87,6 @@
     return (a + b - 1)/b;
 }
-/////////////////////////////////////////////////////////////////////////////////////////
-// Texture references
-/////////////////////////////////////////////////////////////////////////////////////////
-texture<float, 2, cudaReadModeElementType> tex_coarse;
-texture<float, 2, cudaReadModeElementType> tex_fine;
-texture<float, 2, cudaReadModeElementType> tex_I1;
-texture<float, 2, cudaReadModeElementType> tex_I0;
-texture<float, 2, cudaReadModeElementType> tex_Ix;
-texture<float, 2, cudaReadModeElementType> tex_Ixx;
-texture<float, 2, cudaReadModeElementType> tex_Ix0;
-texture<float, 2, cudaReadModeElementType> tex_Iy;
-texture<float, 2, cudaReadModeElementType> tex_Iyy;
-texture<float, 2, cudaReadModeElementType> tex_Iy0;
-texture<float, 2, cudaReadModeElementType> tex_Ixy;
-texture<float, 1, cudaReadModeElementType> tex_u;
-texture<float, 1, cudaReadModeElementType> tex_v;
-texture<float, 1, cudaReadModeElementType> tex_du;
-texture<float, 1, cudaReadModeElementType> tex_dv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_u;
-texture<float, 1, cudaReadModeElementType> tex_numerator_v;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
 /////////////////////////////////////////////////////////////////////////////////////////
 // SUPPLEMENTARY FUNCTIONS
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const fl
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex_id>
-__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
+__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p)
 {
     //position within shared memory array
     const int ijs = js * PSOR_PITCH + is;
@@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
     j = max(j, -j-1);
     j = min(j, h-j+h-1);
     const int pos = j * p + i;
-    switch(tex_id){
-        case 0:
-            smem[ijs] = tex1Dfetch(tex_u, pos);
-            break;
-        case 1:
-            smem[ijs] = tex1Dfetch(tex_v, pos);
-            break;
-        case 2:
-            smem[ijs] = tex1Dfetch(tex_du, pos);
-            break;
-        case 3:
-            smem[ijs] = tex1Dfetch(tex_dv, pos);
-            break;
-    }
+    smem[ijs] = texSrc(pos);
 }
 ///////////////////////////////////////////////////////////////////////////////
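The load_array_element rewrite above is the recurring dispatch change in this PR: with texture references, the texture's identity had to be a compile-time template parameter (template<int tex_id> plus a switch), because each reference is a distinct global. With texture objects the identity is run-time data, so the four instantiations collapse into one function. A standalone sketch of the new style (hypothetical names, not PR code):

#include <cuda_runtime.h>

// One ordinary fetch helper replaces the per-texture template instantiations;
// the same code path serves u, v, du and dv because the handle is a value.
__device__ __forceinline__ float fetch(cudaTextureObject_t tex, int pos)
{
    return tex1Dfetch<float>(tex, pos);
}

__global__ void addFields(cudaTextureObject_t texU, cudaTextureObject_t texDu,
                          float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = fetch(texU, i) + fetch(texDu, i); // no per-texture template
}

int main()
{
    const int n = 256;
    float *u, *du, *out;
    cudaMalloc(&u, n * sizeof(float));
    cudaMalloc(&du, n * sizeof(float));
    cudaMalloc(&out, n * sizeof(float));
    cudaMemset(u, 0, n * sizeof(float));
    cudaMemset(du, 0, n * sizeof(float));

    // helper to wrap a linear float buffer in a texture object
    auto makeTex = [](float* ptr, int count) {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypeLinear;
        res.res.linear.devPtr = ptr;
        res.res.linear.desc = cudaCreateChannelDesc<float>();
        res.res.linear.sizeInBytes = count * sizeof(float);
        cudaTextureDesc td = {};
        cudaTextureObject_t t = 0;
        cudaCreateTextureObject(&t, &res, &td, nullptr);
        return t;
    };
    cudaTextureObject_t texU = makeTex(u, n), texDu = makeTex(du, n);

    addFields<<<1, n>>>(texU, texDu, out, n);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(texU);
    cudaDestroyTextureObject(texDu);
    cudaFree(u); cudaFree(du); cudaFree(out);
    return 0;
}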
@@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(...)
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex>
-__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
+__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p)
 {
     const int i = threadIdx.x + 2;
     const int j = threadIdx.y + 2;
-    load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
+    load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel
     __syncthreads();
     if(threadIdx.y < 2)
     {
         //load bottom shadow elements
-        load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
+        load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load bottom right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
         {
             //load bottom left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
     }
     else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
     {
         //load upper shadow elements
-        load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
+        load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load upper right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
             //load upper left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
} }
else else
@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
if(threadIdx.x < 2) if(threadIdx.x < 2)
{ {
//load middle right shadow elements //load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
} }
else if(threadIdx.x >= PSOR_TILE_WIDTH-2) else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{ {
//load middle left shadow elements //load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
} }
} }
__syncthreads(); __syncthreads();
@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
/// \param alpha (in) alpha in Brox model (flow smoothness) /// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance) /// \param gamma (in) gamma in Brox model (edge importance)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy,
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v,
float *denominator_u, float *denominator_v, int w, int h, int s, float alpha, float gamma)
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
{ {
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT]; __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT]; __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
float x = (float)ig + 0.5f; float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f; float y = (float)jg + 0.5f;
//load u and v to smem //load u and v to smem
load_array<0>(u, ig, jg, w, h, s); load_array(texU, u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s); load_array(texV, v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s); load_array(texDu, du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s); load_array(texDv, dv, ig, jg, w, h, s);
//warped position //warped position
float wx = (x + u[ijs])/(float)w; float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h; float wy = (y + v[ijs])/(float)h;
x /= (float)w; x /= (float)w;
y /= (float)h; y /= (float)h;
//compute image derivatives //compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y); const float Iz = texI1(wy, wx) - texI0(y, x);
const float Ix = tex2D(tex_Ix, wx, wy); const float Ix = texIx(wy, wx);
const float Ixz = Ix - tex2D(tex_Ix0, x, y); const float Ixz = Ix - texIx0(y, x);
const float Ixy = tex2D(tex_Ixy, wx, wy); const float Ixy = texIxy(wy, wx);
const float Ixx = tex2D(tex_Ixx, wx, wy); const float Ixx = texIxx(wy, wx);
const float Iy = tex2D(tex_Iy, wx, wy); const float Iy = texIy(wy, wx);
const float Iyz = Iy - tex2D(tex_Iy0, x, y); const float Iyz = Iy - texIy0(y, x);
const float Iyy = tex2D(tex_Iyy, wx, wy); const float Iyy = texIyy(wy, wx);
//compute data term //compute data term
float q0, q1, q2; float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs]; q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
///\param h ///\param h
///\param s ///\param s
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v, __global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s)
int w, int h, int s)
{ {
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
} }
if(inside) if(inside)
{ {
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg); sx[ijs] = texDiffX(ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg); sy[ijs] = texDiffY(ijg);
} }
else else
{ {
@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
if(j == PSOR_TILE_HEIGHT-1) if(j == PSOR_TILE_HEIGHT-1)
{ {
if(jg < h-1 && inside) if(jg < h-1 && inside)
{ sy[up] = texDiffY(ijg + s);
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
else else
{
sy[up] = 0.0f; sy[up] = 0.0f;
}
} }
int right = ijs + 1; int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1) if(threadIdx.x == PSOR_TILE_WIDTH-1)
{ {
if(ig < w-1 && inside) if(ig < w-1 && inside)
{ sx[right] = texDiffX(ijg + 1);
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
else else
{
sx[right] = 0.0f; sx[right] = 0.0f;
}
} }
__syncthreads(); __syncthreads();
float diffusivity_sum; float diffusivity_sum;
@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
// Red-Black SOR // Red-Black SOR
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
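The two instantiations of the kernel below, sor_pass<0> and sor_pass<1>, split the grid into a checkerboard: a cell (i, j) is updated by the pass whose template parameter equals (i + j) % 2, so each half-sweep reads only neighbours of the opposite colour written by the previous half-sweep. A minimal sketch of the ordering (illustration only, not part of the patch):

// Cells touched by sor_pass<isBlack>: the (i + j) % 2 == isBlack half of the grid.
// Each cell's four neighbours (i-1, j), (i+1, j), (i, j-1), (i, j+1) have the
// opposite parity, which is what keeps the red-black SOR sweep free of data races.
for (int j = 0; j < height; ++j)
    for (int i = 0; i < width; ++i)
        if ((i + j) % 2 == isBlack)
            update(i, j);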
template<int isBlack> __global__ void sor_pass(float *new_du, template<int isBlack> __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u,
float *new_dv, const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride)
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
{ {
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y; int j = blockIdx.y * blockDim.y + threadIdx.y;
@ -560,14 +493,14 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
//load smooth term //load smooth term
float s_up, s_left, s_right, s_down; float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos); s_left = texDiffX(pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos); s_down = texDiffY(pos);
if(i < width-1) if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r); s_right = texDiffX(pos_r);
else else
s_right = 0.0f; //Neumann BC s_right = 0.0f; //Neumann BC
if(j < height-1) if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u); s_up = texDiffY(pos_u);
else else
s_up = 0.0f; //Neumann BC s_up = 0.0f; //Neumann BC
@ -577,30 +510,29 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
float du_up, du_left, du_right, du_down, du; float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv; float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l); u_left = texU(pos_l);
u_right = tex1Dfetch(tex_u, pos_r); u_right = texU(pos_r);
u_down = tex1Dfetch(tex_u, pos_d); u_down = texU(pos_d);
u_up = tex1Dfetch(tex_u, pos_u); u_up = texU(pos_u);
u = tex1Dfetch(tex_u, pos); u = texU(pos);
v_left = tex1Dfetch(tex_v, pos_l); v_left = texV(pos_l);
v_right = tex1Dfetch(tex_v, pos_r); v_right = texV(pos_r);
v_down = tex1Dfetch(tex_v, pos_d); v_down = texV(pos_d);
v = tex1Dfetch(tex_v, pos); v = texV(pos);
v_up = tex1Dfetch(tex_v, pos_u); v_up = texV(pos_u);
du = tex1Dfetch(tex_du, pos); du = texDu(pos);
du_left = tex1Dfetch(tex_du, pos_l); du_left = texDu(pos_l);
du_right = tex1Dfetch(tex_du, pos_r); du_right = texDu(pos_r);
du_down = tex1Dfetch(tex_du, pos_d); du_down = texDu(pos_d);
du_up = tex1Dfetch(tex_du, pos_u); du_up = texDu(pos_u);
dv = tex1Dfetch(tex_dv, pos); dv = texDv(pos);
dv_left = tex1Dfetch(tex_dv, pos_l); dv_left = texDv(pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r); dv_right = texDv(pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d); dv_down = texDv(pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u); dv_up = texDv(pos_u);
float numerator_dudv = g_numerator_dudv[pos]; float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack) if((i+j)%2 == isBlack)
@ -624,52 +556,6 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// utility functions // utility functions
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
namespace namespace
{ {
struct ImagePyramid struct ImagePyramid
@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice), NCV_CUDA_ERROR); cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
InitTextures();
} }
//prepare image pyramid //prepare image pyramid
@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution //select images with lowest resolution
size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
FloatVector* ptrU = &u; FloatVector* ptrU = &u;
@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
//texture format descriptor
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter; I0 = *img0Iter;
I1 = *img1Iter; I1 = *img1Iter;
++img0Iter; ++img0Iter;
++img1Iter; ++img1Iter;
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
//compute derivatives //compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6)); dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes);
Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes);
// flow // flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes);
// flow increments // flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes);
Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes);
Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes);
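Each cv::cudev::Texture constructed above wraps a cudaTextureObject_t that is destroyed when the object goes out of scope at the end of the level loop, so no explicit unbind is needed. As a rough sketch of what such a constructor presumably does for texI0 (raw runtime API, illustration only; the mapping of its arguments onto a pitch2D resource is an assumption):

cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = I0->ptr();
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.width = kLevelWidth;
resDesc.res.pitch2D.height = kLevelHeight;
resDesc.res.pitch2D.pitchInBytes = kLevelStride * sizeof(float);

cudaTextureDesc texDesc = {};
texDesc.normalizedCoords = 1;            // matches the 'true' argument above
texDesc.filterMode = cudaFilterModeLinear;
texDesc.addressMode[0] = cudaAddressModeMirror;
texDesc.addressMode[1] = cudaAddressModeMirror;

cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
// ... launch kernels taking tex ...
cudaDestroyTextureObject(tex);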
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT)); dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT); dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration) for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{ {
//compute coefficients //compute coefficients
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy,
(diffusivity_x.ptr(), diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma);
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
//solve linear system //solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration) for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{ {
float omega = 1.99f; float omega = 1.99f;
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(),
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(),
num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop }//end of solver loop
}// end of inner loop }// end of inner loop
@ -72,6 +72,7 @@
#include "opencv2/cudalegacy/NCV.hpp" #include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "NCVRuntimeTemplates.hpp" #include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp" #include "NCVAlg.hpp"
@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2) #define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
/** \internal
* Container for the flattened Haar classifier trees.
* Two parts: the first holds root nodes, the second the nodes referenced by root nodes.
* Drawback: breaks tree locality (might cause more cache misses)
* Advantage: no need for an additional 32-bit field to index root node offsets
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{ {
return d_Stages[iStage]; return d_Stages[iStage];
@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
template <NcvBool tbCacheTextureCascade> template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) __device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{ {
HaarClassifierNode128 tmpNode; HaarClassifierNode128 tmpNode;
if (tbCacheTextureCascade) if (tbCacheTextureCascade)
{ tmpNode._ui4 = texHaarClassifierNodes(iNode);
tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
}
else else
{
tmpNode = d_ClassifierNodes[iNode]; tmpNode = d_ClassifierNodes[iNode];
}
return tmpNode; return tmpNode;
} }
template <NcvBool tbCacheTextureCascade> template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, __device__ void getFeature(cv::cudev::TexturePtr<uint2> texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight)
Ncv32f *weight,
Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{ {
HaarFeature64 feature; HaarFeature64 feature;
if (tbCacheTextureCascade) if (tbCacheTextureCascade)
{ feature._ui2 = texHaarFeatures(iFeature);
feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
}
else else
{
feature = d_Features[iFeature]; feature = d_Features[iFeature];
}
feature.getRect(rectX, rectY, rectWidth, rectHeight); feature.getRect(rectX, rectY, rectWidth, rectHeight);
*weight = feature.getWeight(); *weight = feature.getWeight();
} }
template <NcvBool tbCacheTextureIImg> template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg) __device__ Ncv32u getElemIImg(cv::cudev::TexturePtr<Ncv32u> texImg, Ncv32u x, Ncv32u *d_IImg)
{ {
if (tbCacheTextureIImg) if (tbCacheTextureIImg)
{ return texImg(x);
return tex1Dfetch(texIImage, x);
}
else else
{
return d_IImg[x]; return d_IImg[x];
}
} }
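The getElemIImg helper above feeds the standard integral-image box sum used by the kernels below: with II(x, y) holding the sum over [0, x) x [0, y), the sum over any rectangle takes only four fetches. A small worked example (illustration only):

// Source image:
//   1 2 3
//   4 5 6
//   7 8 9
// Sum of the bottom-right 2x2 block {5, 6, 8, 9}, TL = (1, 1), w = h = 2,
// in the same fetch order as the kernels (BR - BL + TL - TR):
//   II(3,3) - II(1,3) + II(1,1) - II(3,1)
// = 45 - 12 + 1 - 6 = 28 = 5 + 6 + 8 + 9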
@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
} }
template <NcvBool tbInitMaskPositively, template <NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction>
NcvBool tbCacheTextureIImg, __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes,
NcvBool tbCacheTextureCascade, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask,
NcvBool tbReadPixelIndexFromVector, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{ {
Ncv32u y_offs; Ncv32u y_offs;
Ncv32u x_offs; Ncv32u x_offs;
@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{ {
while (bMoreNodesToTraverse) while (bMoreNodesToTraverse)
{ {
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset(); Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{ {
Ncv32f rectWeight; Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight; Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade> getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth; Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
} }
template <NcvBool tbCacheTextureIImg, template <NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction>
NcvBool tbCacheTextureCascade, __global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
NcvBool tbDoAtomicCompaction> Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{ {
Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x; Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
while (bMoreNodesToTraverse) while (bMoreNodesToTraverse)
{ {
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset(); Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
{ {
Ncv32f rectWeight; Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight; Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade> getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth; Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor
{ {
dim3 gridConf, blockConf; dim3 gridConf, blockConf;
cudaStream_t cuStream; cudaStream_t cuStream;
cv::cudev::TexturePtr<Ncv32u> texImg;
//Kernel arguments are stored as members; cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg; Ncv32u *d_IImg;
Ncv32u IImgStride; Ncv32u IImgStride;
Ncv32f *d_weights; Ncv32f *d_weights;
@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor
Ncv32f scaleArea; Ncv32f scaleArea;
//Arguments are passed through the constructor //Arguments are passed through the constructor
applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, dim3 _gridConf,
Ncv32u *_d_IImg, Ncv32u _IImgStride, dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes,
Ncv32f *_d_weights, Ncv32u _weightsStride, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) :
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride),
Ncv32u *_d_inMask, Ncv32u *_d_outMask, d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen),
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{} {}
template<class TList> template<class TList>
@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor
Loki::TL::TypeAt<TList, 2>::Result::value, Loki::TL::TypeAt<TList, 2>::Result::value,
Loki::TL::TypeAt<TList, 3>::Result::value, Loki::TL::TypeAt<TList, 3>::Result::value,
Loki::TL::TypeAt<TList, 4>::Result::value > Loki::TL::TypeAt<TList, 4>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>> <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
(d_IImg, IImgStride, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
} }
}; };
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction,
NcvBool tbCacheTextureIImg, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
NcvBool tbCacheTextureCascade, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
NcvBool tbReadPixelIndexFromVector, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{ {
applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages,
applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters //Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor> NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor
{ {
dim3 gridConf, blockConf; dim3 gridConf, blockConf;
cudaStream_t cuStream; cudaStream_t cuStream;
cv::cudev::TexturePtr<Ncv32u> texImg;
//Kernel arguments are stored as members; cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg; Ncv32u *d_IImg;
Ncv32u IImgStride; Ncv32u IImgStride;
Ncv32f *d_weights; Ncv32f *d_weights;
@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor
Ncv32f scaleArea; Ncv32f scaleArea;
//Arguments are passed through the constructor //Arguments are passed through the constructor
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_,
Ncv32u *_d_IImg, Ncv32u _IImgStride, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features,
Ncv32f *_d_weights, Ncv32u _weightsStride, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_),
Ncv32u *_d_inMask, Ncv32u *_d_outMask, texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features),
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi),
NcvSize32u _anchorsRoi, Ncv32u _startStageInc, startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{} {}
template<class TList> template<class TList>
@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor
Loki::TL::TypeAt<TList, 0>::Result::value, Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value, Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value > Loki::TL::TypeAt<TList, 2>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>> <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
(d_IImg, IImgStride, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
} }
}; };
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
NcvBool tbCacheTextureCascade, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights,
NcvBool tbDoAtomicCompaction, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{ {
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features,
d_IImg, IImgStride, d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters //Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor> NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
NCV_SKIP_COND_BEGIN NCV_SKIP_COND_BEGIN
cv::cudev::Texture<Ncv32u> texImg;
if (bTexCacheIImg) if (bTexCacheIImg)
{ texImg = cv::cudev::Texture<Ncv32u>((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr());
cudaChannelFormatDesc cfdTexIImage;
cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();
size_t alignmentOffset; cv::cudev::Texture<uint2> texHaarFeatures;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage, cv::cudev::Texture<uint4> texHaarClassifierNodes;
(anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR); if (bTexCacheCascade) {
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); texHaarFeatures = cv::cudev::Texture<uint2>(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast<uint2*>(d_HaarFeatures.ptr()));
} texHaarClassifierNodes = cv::cudev::Texture<uint4>(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast<uint4*>(d_HaarNodes.ptr()));
if (bTexCacheCascade)
{
cudaChannelFormatDesc cfdTexHaarFeatures;
cudaChannelFormatDesc cfdTexHaarClassifierNodes;
cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
} }
Ncv32u stageStartAnchorParallel = 0; Ncv32u stageStartAnchorParallel = 0;
@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
anchorsRoi.height); anchorsRoi.height);
dim3 block1(NUM_THREADS_ANCHORSPARALLEL); dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate( applyHaarClassifierAnchorParallelDynTemplate( true, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream,
true, //tbInitMaskPositively texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
bTexCacheIImg, //tbCacheTextureIImg d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex],
bTexCacheCascade, //tbCacheTextureCascade pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid1,
block1,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
0,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
} }
dim3 block2(NUM_THREADS_ANCHORSPARALLEL); dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate( applyHaarClassifierAnchorParallelDynTemplate( false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction,
false, //tbInitMaskPositively grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(),
bTexCacheIImg, //tbCacheTextureIImg d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi,
bTexCacheCascade, //tbCacheTextureCascade pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid2,
block2,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
} }
dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL); dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
applyHaarClassifierClassifierParallelDynTemplate( applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(),
bTexCacheIImg, //tbCacheTextureIImg d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections,
bTexCacheCascade, //tbCacheTextureCascade d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels);
bDoAtomicCompaction, //tbDoAtomicCompaction
grid3,
block3,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
stageMiddleSwitch,
stageEndClassifierParallel,
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -48,12 +48,7 @@
#include "opencv2/cudev.hpp" #include "opencv2/cudev.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
texture<uint2, 1, cudaReadModeElementType> tex64u;
//============================================================================== //==============================================================================
// //
@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void)
} }
cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
{ {
cudaStream_t tmp = nppStream; cudaStream_t tmp = nppStream;
@ -117,25 +111,25 @@ private:
template<class T> template<class T>
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs); inline __device__ T readElem(cv::cudev::TexturePtr<Ncv8u> tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
template<> template<>
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv8u readElem<Ncv8u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs); return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs);
} }
template<> template<>
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv32u readElem<Ncv32u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return d_src[curElemOffs]; return d_src[curElemOffs];
} }
template<> template<>
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv32f readElem<Ncv32f>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return d_src[curElemOffs]; return d_src[curElemOffs];
} }
@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u
* \return None * \return None
*/ */
template <class T_in, class T_out, bool tbDoSqr> template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, __global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride)
T_out *d_II, Ncv32u IIstride)
{ {
//advance pointers to the current line //advance pointers to the current line
if (sizeof(T_in) != 1) if (sizeof(T_in) != 1)
@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
if (curElemOffs < srcWidth) if (curElemOffs < srcWidth)
{ {
//load elements //load elements
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs); curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
} }
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem); curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@ -224,25 +217,9 @@ template <bool tbDoSqr, class T_in, class T_out>
NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride, NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi) T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{ {
cudaChannelFormatDesc cfdTex; cv::cudev::Texture<Ncv8u> tex8u(static_cast<size_t>(roi.height * srcStride), (Ncv8u*)d_src);
size_t alignmentOffset = 0; scanRows <T_in, T_out, tbDoSqr> <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride);
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
{
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
}
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS; return NPPST_SUCCESS;
} }
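
For readers new to the texture-object API, the shape of the change above reduced to a minimal sketch (kernel and launcher names are illustrative, not part of the patch): the global texture<> reference plus the cudaBindTexture/cudaUnbindTexture pair becomes a scoped cv::cudev::Texture that is passed to the kernel by value, converting to a cv::cudev::TexturePtr.

#include <opencv2/cudev/ptr2d/texture.hpp>

// Hypothetical kernel: copies n bytes read through a 1D texture object.
// tex(i) replaces the old tex1Dfetch(texRef, i).
__global__ void copyThroughTex(cv::cudev::TexturePtr<uchar> tex, uchar* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = tex(i);
}

void launchCopy(uchar* d_src, uchar* d_dst, int n, cudaStream_t stream)
{
    // RAII: the texture object lives for the scope of the call, so there is
    // no global state and no unbind step (size is in bytes for 1D textures).
    cv::cudev::Texture<uchar> tex(static_cast<size_t>(n), d_src);
    copyThroughTex<<<(n + 255) / 256, 256, 0, stream>>>(tex, d_dst, n);
}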
@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32; const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8; const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template <class T>
template<class T, NcvBool tbCacheTexture> __global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{ {
return d_src[x]; int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale];
} }
template <class T>
template <class T, NcvBool tbCacheTexture> __global__ void decimate_C1R(cv::cudev::TexturePtr<T> texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep,
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
NcvSize32u dstRoi, Ncv32u scale)
{ {
int curX = blockIdx.x * blockDim.x + threadIdx.x; int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y; int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
if (curX >= dstRoi.width || curY >= dstRoi.height) d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale);
{
return;
}
d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
} }
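
The removed getElem_Decimate<Ncv64u, true> specialization is worth a note: CUDA textures have no 64-bit integer channel format, so the reference-based path fetched a uint2 and reassembled it, as sketched below with the texture-object API; the cv::cudev::Texture<Ncv64u> used in the new wrapper is assumed to perform the equivalent split internally.

// Reassembling a 64-bit value from a two-channel 32-bit texel, as the old
// texture-reference path did:
__device__ __forceinline__ Ncv64u fetch64(cudaTextureObject_t tex, int x)
{
    const uint2 tmp = tex1Dfetch<uint2>(tex, x);
    Ncv64u res = (Ncv64u)tmp.y;  // high 32 bits
    res <<= 32;
    res |= tmp.x;                // low 32 bits
    return res;
}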
template <class T> template <class T>
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep, static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep, T *d_dst, Ncv32u dstStep,
@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X, dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
(dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y); (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y); dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
if (!readThruTexture) {
if (!readThruTexture) decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
{
decimate_C1R
<T, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
} }
else else {
{ cv::cudev::Texture<T> texSrc(srcRoi.height * srcStep * sizeof(T), d_src);
cudaChannelFormatDesc cfdTexSrc; decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSrc, srcStep, d_dst, dstStep, dstRoi, scale);
if (sizeof(T) == sizeof(Ncv32u))
{
cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
else
{
cfdTexSrc = cudaCreateChannelDesc<uint2>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
decimate_C1R
<T, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
} }
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
implementNppDecimate(32, u) implementNppDecimate(32, u)
implementNppDecimate(32, s)
implementNppDecimate(32, f)
implementNppDecimate(64, u) implementNppDecimate(64, u)
implementNppDecimate(64, s)
implementNppDecimate(64, f)
implementNppDecimateHost(32, u) implementNppDecimateHost(32, u)
implementNppDecimateHost(32, s) implementNppDecimateHost(32, s)
implementNppDecimateHost(32, f) implementNppDecimateHost(32, f)
@ -776,43 +688,29 @@ implementNppDecimateHost(64, f)
const Ncv32u NUM_RECTSTDDEV_THREADS = 128; const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum) __device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum)
{ {
if (tbCacheTexture) if (tbCacheTexture)
{ return tex(x);
return tex1Dfetch(tex32u, x);
}
else else
{
return d_sum[x]; return d_sum[x];
}
} }
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum) __device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum)
{ {
if (tbCacheTexture) if (tbCacheTexture)
{ return tex(x);
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
else else
{
return d_sqsum[x]; return d_sqsum[x];
}
} }
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture>
__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, __global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr<Ncv32u> texSum, cv::cudev::TexturePtr<Ncv64u> texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv64u *d_sqsum, Ncv32u sqsumStep, Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
Ncv32f *d_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
{ {
Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x; Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
if (x_offs >= roi.width) if (x_offs >= roi.width)
@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs; Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
//OPT: try swapping order (could change cache hit/miss ratio) //OPT: try swapping order (could change cache hit/miss ratio)
Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum); Ncv32u sum_tl = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum); Ncv32u sum_bl = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum); Ncv32u sum_tr = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum); Ncv32u sum_br = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl; Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br; Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum); sqsum_tl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum); sqsum_bl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum); sqsum_tr = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum); sqsum_br = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl; Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
Ncv32f mean = sum_val * invRectArea; Ncv32f mean = sum_val * invRectArea;
@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height); dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
dim3 block(NUM_RECTSTDDEV_THREADS); dim3 block(NUM_RECTSTDDEV_THREADS);
cv::cudev::Texture<Ncv32u> texSum((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum);
cv::cudev::Texture<Ncv64u> texSumSq((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum);
if (!readThruTexture) if (!readThruTexture)
{ rectStdDev_32f_C1R<false><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
rectStdDev_32f_C1R
<false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
}
else else
{ rectStdDev_32f_C1R<true><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
cudaChannelFormatDesc cfdTexSrc;
cudaChannelFormatDesc cfdTexSqr;
cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
cfdTexSqr = cudaCreateChannelDesc<uint2>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
rectStdDev_32f_C1R
<true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
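
For reference, the statistics this kernel computes: four corner fetches per integral image give a rectangle sum in O(1), from which mean and standard deviation follow. A host-side sketch (names illustrative; the kernel's tail is cut off in this hunk, so the stddev line assumes the conventional sqrt(E[x^2] - mean^2) with a clamp against slightly negative variance from rounding; steps are in elements):

float rectStdDevRef(const Ncv32u* sum, Ncv32u sumStep,     // integral image
                    const Ncv64u* sqsum, Ncv32u sqsumStep, // squared integral image
                    int x, int y, int w, int h, float invRectArea)
{
    const Ncv32u s = sum[(y + h) * sumStep + x + w] + sum[y * sumStep + x]
                   - sum[y * sumStep + x + w]       - sum[(y + h) * sumStep + x];
    const Ncv64u sq = sqsum[(y + h) * sqsumStep + x + w] + sqsum[y * sqsumStep + x]
                    - sqsum[y * sqsumStep + x + w]       - sqsum[(y + h) * sqsumStep + x];
    const float mean = s * invRectArea;
    return sqrtf(fmaxf((float)sq * invRectArea - mean * mean, 0.0f));
}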
@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
// //
//============================================================================== //==============================================================================
__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr<Ncv32f> tex, const int rowOffset, int i, int w)
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
{ {
if (i < 0) i = 1 - i; if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1; if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i); return tex(rowOffset + i);
} }
__forceinline__ __device__ float getValueMirrorColumn(const int offset, __forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr<Ncv32f> tex, const int offset, const int rowStep, int j, int h)
const int rowStep,
int j,
int h)
{ {
if (j < 0) j = 1 - j; if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1; if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep); return tex(offset + j * rowStep);
} }
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep, __global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel1, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32f *pDst, NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{ {
// position within ROI // position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int ix = blockDim.x * blockIdx.x + threadIdx.x;
@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f; float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m) for (int m = 0; m < nKernelSize; ++m)
{ {
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width) sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m); * texKernel1(m);
} }
pDst[iy * dstStep + ix] = sum * multiplier; pDst[iy * dstStep + ix] = sum * multiplier;
} }
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep, __global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32f *pDst, NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{ {
const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y; const int iy = blockDim.y * blockIdx.y + threadIdx.y;
@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f; float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m) for (int m = 0; m < nKernelSize; ++m)
{ {
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height) sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m); * texKernel(m);
} }
pDst[ix + iy * dstStep] = sum * multiplier; pDst[ix + iy * dstStep] = sum * multiplier;
} }
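
The reflection both helpers apply, isolated for clarity (a sketch; it assumes the out-of-range offset is less than one full period, which holds here because the kernel anchor never exceeds the kernel size):

// Mirror indexing as used by getValueMirrorRow/getValueMirrorColumn:
//   i < 0   ->  1 - i          (e.g. -1 maps to 2)
//   i >= w  ->  2 * w - i - 1  (e.g.  w maps to w - 1)
__device__ __forceinline__ int mirrorIndex(int i, int w)
{
    if (i < 0)  i = 1 - i;
    if (i >= w) i = w + w - i - 1;
    return i;
}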
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
Ncv32f *pDst, Ncv32f *pDst,
@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep, Ncv32u nDstStep,
NcvRect32u oROI, NcvRect32u oROI,
NppStBorderType borderType, NppStBorderType borderType,
const Ncv32f *pKernel, Ncv32f *pKernel,
Ncv32s nKernelSize, Ncv32s nKernelSize,
Ncv32s nAnchor, Ncv32s nAnchor,
Ncv32f multiplier) Ncv32f multiplier)
@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y; oROI.height = srcSize.height - oROI.y;
} }
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> (); cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
texSrc.normalized = false; cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap: case nppStBorderWrap:
return NPPST_ERROR; return NPPST_ERROR;
case nppStBorderMirror: case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break; break;
default: default:
@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
} }
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
Ncv32f *pDst, Ncv32f *pDst,
@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep, Ncv32u nDstStep,
NcvRect32u oROI, NcvRect32u oROI,
NppStBorderType borderType, NppStBorderType borderType,
const Ncv32f *pKernel, Ncv32f *pKernel,
Ncv32s nKernelSize, Ncv32s nKernelSize,
Ncv32s nAnchor, Ncv32s nAnchor,
Ncv32f multiplier) Ncv32f multiplier)
@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y; oROI.height = srcSize.height - oROI.y;
} }
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> (); cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
texSrc.normalized = false; cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap: case nppStBorderWrap:
return NPPST_ERROR; return NPPST_ERROR;
case nppStBorderMirror: case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break; break;
default: default:
@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
return (num + denom - 1)/denom; return (num + denom - 1)/denom;
} }
__global__ void BlendFramesKernel(cv::cudev::TexturePtr<Ncv32f> texSrc0, cv::cudev::TexturePtr<Ncv32f> texSrc1,
texture<float, 2, cudaReadModeElementType> tex_src1; const float *u, const float *v, // forward flow
texture<float, 2, cudaReadModeElementType> tex_src0; const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s, float theta, float *out)
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
{ {
const int ix = threadIdx.x + blockDim.x * blockIdx.x; const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y; const int iy = threadIdx.y + blockDim.y * blockIdx.y;
@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v, // forward f
bool b0 = o0[pos] > 1e-4f; bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f; bool b1 = o1[pos] > 1e-4f;
if (b0 && b1) if (b0 && b1) // pixel is visible on both frames
{ out[pos] = texSrc0(y - _v * theta, x - _u * theta) * (1.0f - theta) + texSrc1(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta;
// pixel is visible on both frames else if (b0) // visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) + out[pos] = texSrc0(y - _v * theta, x - _u * theta);
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta; else // visible on the second frame only
} out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta));
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
} }
NCVStatus BlendFrames(const Ncv32f *src0, NCVStatus BlendFrames(Ncv32f *src0,
const Ncv32f *src1, Ncv32f *src1,
const Ncv32f *ufi, const Ncv32f *ufi,
const Ncv32f *vfi, const Ncv32f *vfi,
const Ncv32f *ubi, const Ncv32f *ubi,
@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0,
Ncv32f theta, Ncv32f theta,
Ncv32f *out) Ncv32f *out)
{ {
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float); const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); cv::cudev::Texture<Ncv32f> texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); cv::cudev::Texture<Ncv32f> texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear);
dim3 threads (32, 4); dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y)); dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS; return NPPST_SUCCESS;
} }
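
A pattern that recurs throughout this patch and is easy to miss in the diff: tex2D(ref, x, y) takes the column first, while cv::cudev::TexturePtr's operator() takes (row, column), so every migrated call site swaps its arguments. A minimal sketch (kernel is illustrative):

// tex2D(ref, x, y) becomes tex(y, x) with the texture-object wrapper.
__global__ void sampleBilinear(cv::cudev::TexturePtr<float> tex, float* out,
                               int w, int h, int strideElems)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h) return;
    // With cudaFilterModeLinear, sampling at the texel centre (+0.5f) returns
    // the texel itself; fractional offsets give hardware bilinear blending.
    out[y * strideElems + x] = tex(y + 0.5f, x + 0.5f);
}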
@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
// //
//============================================================================== //==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__ __forceinline__
__device__ float processLine(int spos, __device__ float processLine(cv::cudev::TexturePtr<Ncv32f> tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax)
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
{ {
// first element // first element
float wsum = 1.0f - xmin + fxmin; float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin); float sum = tex(spos) * (1.0f - xmin + fxmin);
spos++; spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix) for (int ix = ixmin + 1; ix < ixmax; ++ix)
{ {
sum += tex1Dfetch(texSrc, spos); sum += tex(spos);
spos++; spos++;
wsum += 1.0f; wsum += 1.0f;
} }
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax); sum += tex(spos) * (cxmax - xmax);
wsum += cxmax - xmax; wsum += cxmax - xmax;
return sum / wsum; return sum / wsum;
} }
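
What processLine computes, restated on the host (a sketch; the device version receives the floor/ceil values precomputed by its caller): a weighted mean over [xmin, xmax] where the two partial end texels get fractional weights and interior texels weight 1.

float processLineRef(const float* row, float xmin, float xmax)
{
    const int ixmin = (int)floorf(xmin), ixmax = (int)floorf(xmax);
    const float fxmin = floorf(xmin), cxmax = ceilf(xmax);
    float wsum = 1.0f - xmin + fxmin;            // partial first texel
    float sum  = row[ixmin] * wsum;
    for (int ix = ixmin + 1; ix < ixmax; ++ix)   // full interior texels
    {
        sum  += row[ix];
        wsum += 1.0f;
    }
    sum  += row[ixmax] * (cxmax - xmax);         // partial last texel
    wsum += cxmax - xmax;
    return sum / wsum;
}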
__global__ void resizeSuperSample_32f(NcvSize32u srcSize, __global__ void resizeSuperSample_32f(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32u srcStep, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{ {
// position within dst ROI // position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x; const int ix = blockIdx.x * blockDim.x + threadIdx.x;
@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
float wsum = 1.0f - yBegin + floorYBegin; float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin); ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep; pos += srcStep;
for (int iy = iYBegin + 1; iy < iYEnd; ++iy) for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
{ {
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd); ceilXEnd);
pos += srcStep; pos += srcStep;
wsum += 1.0f; wsum += 1.0f;
} }
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd); ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd; wsum += ceilYEnd - yEnd;
sum /= wsum; sum /= wsum;
@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_)
} }
__global__ void resizeBicubic(NcvSize32u srcSize, __global__ void resizeBicubic(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{ {
const int ix = blockIdx.x * blockDim.x + threadIdx.x; const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y; const int iy = blockIdx.y * blockDim.y + threadIdx.y;
@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
float wx = bicubicCoeff (xDist); float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist); float wy = bicubicCoeff (yDist);
wx *= wy; wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy); sum += wx * texSrc(cy * dy, cx * dx);
wsum += wx; wsum += wx;
} }
} }
@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
} }
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
NcvRect32u srcROI, NcvRect32u srcROI,
@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
if (interpolation == nppStSupersample) if (interpolation == nppStSupersample)
{ {
// bind texture cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,(dstROI.height + ctaSize.y - 1) / ctaSize.y);
(dstROI.height + ctaSize.y - 1) / ctaSize.y); resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
} }
else if (interpolation == nppStBicubic) else if (interpolation == nppStBicubic)
{ {
texSrc2D.addressMode[0] = cudaAddressModeMirror; cv::cudev::Texture<float> texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror);
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y);
(dstSize.height + ctaSize.y - 1) / ctaSize.y); resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
} }
else else
{ {

@ -46,29 +46,27 @@
#include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
namespace optflowbm namespace optflowbm
{ {
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp); __device__ int cmpBlocks(cv::cudev::TexturePtr<uchar> texCurr, cv::cudev::TexturePtr<uchar> texPrev, int X1, int Y1, int X2, int Y2, int2 blockSize)
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{ {
int s = 0; int s = 0;
for (int y = 0; y < blockSize.y; ++y) for (int y = 0; y < blockSize.y; ++y)
{ {
for (int x = 0; x < blockSize.x; ++x) for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y)); s += ::abs(texPrev(Y1 + y, X1 + x) - texCurr(Y2 + y, X2 + x));
} }
return s; return s;
} }
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious, __global__ void calcOptFlowBM(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel, const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount) const short2* ss, const int ssCount)
{ {
@ -90,7 +88,7 @@ namespace optflowbm
int dist = numeric_limits<int>::max(); int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize); dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
int countMin = 1; int countMin = 1;
int sumx = offX; int sumx = offX;
@ -111,7 +109,7 @@ namespace optflowbm
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{ {
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize); const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel) if (tmpDist < acceptLevel)
{ {
sumx = dx; sumx = dx;
@ -151,16 +149,12 @@ namespace optflowbm
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious, void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream) int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{ {
bindTexture(&tex_prev, prev); cv::cudev::Texture<uchar> texPrev(prev);
bindTexture(&tex_curr, curr); cv::cudev::Texture<uchar> texCurr(curr);
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y)); const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
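
Note the third constructor form used in calc above: cv::cudev::Texture can be built directly from a PtrStepSz, inferring rows, cols and step. A sketch of the launch pattern, assuming that overload and that its defaults (point sampling, clamped addressing) match the removed texture references, which were declared with exactly those modes:

void launchBM(PtrStepSzb prev, PtrStepSzb curr, /* remaining args elided */ cudaStream_t stream)
{
    // rows/cols/step come from the PtrStepSz arguments themselves.
    cv::cudev::Texture<uchar> texPrev(prev);
    cv::cudev::Texture<uchar> texCurr(curr);
    // calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, ...);
}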

@ -100,7 +100,8 @@ bool TestHypothesesGrow::process()
NCV_SKIP_COND_BEGIN NCV_SKIP_COND_BEGIN
ncvAssertReturn(this->src.fill(h_vecSrc), false); ncvAssertReturn(this->src.fill(h_vecSrc), false);
memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
*h_vecDst.ptr() = {};
NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u)); NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
ncvAssertReturn(h_vecDst_as32u.isMemReused(), false); ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
ncvAssertReturn(this->src.fill(h_vecDst_as32u), false); ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp" #include "opencv2/core/cuda/warp_shuffle.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -825,64 +826,57 @@ namespace cv { namespace cuda { namespace device
//------------------------------------------------------------------- //-------------------------------------------------------------------
// Resize // Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex; __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar, float> src, float sx, float sy, PtrStepSz<uchar> dst)
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
{ {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255; dst.ptr(y)[x] = src(y * sy, x * sx) * 255;
} }
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs) __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar4, float4> src, float sx, float sy, PtrStepSz<uchar4> dst)
{ {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
{ {
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy); float4 val = src(y * sy, x * sx);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255); dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
} }
} }
template<class T, class TEX> static void resize_for_hog_8UC1(const PtrStepSzb& src, PtrStepSzb dst)
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
{ {
tex.filterMode = cudaFilterModeLinear; cv::cudev::Texture<uchar,float> tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
dim3 threads(32, 8); dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols; float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows; float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs); resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar>)dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
}
static void resize_for_hog_8UC4(const PtrStepSzb& src, PtrStepSzb dst)
{
cv::cudev::Texture<uchar4, float4> tex(src.rows, src.cols, reinterpret_cast<uchar4*>(src.data), src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
cudaSafeCall( cudaUnbindTexture(tex) ); resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar4>)dst);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
} }
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); } void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC1(src, dst); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); } void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC4(src, dst); }
} // namespace hog } // namespace hog
}}} // namespace cv { namespace cuda { namespace cudev }}} // namespace cv { namespace cuda { namespace cudev
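
One subtlety in these kernels: with cudaReadModeNormalizedFloat an 8-bit texel is returned as a float in [0, 1] — which is what allows cudaFilterModeLinear to interpolate integer data at all — hence the multiply by 255 before storing. The round trip in isolation, assuming the Texture<uchar, float> constructor used above:

// Normalized-float round trip: an 8-bit texel t reads back as t / 255.0f, so
// a bilinear sample s in [0, 1] is restored to 8-bit range with s * 255.
__device__ __forceinline__ uchar sampleTo8U(cv::cudev::TexturePtr<uchar, float> tex, float y, float x)
{
    return static_cast<uchar>(tex(y, x) * 255.0f);
}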

@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
*/ */
//============== caltech hog tests =====================// //============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string> > struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string, bool>>
{ {
cv::cuda::DeviceInfo devInfo; cv::cuda::DeviceInfo devInfo;
cv::Mat img; cv::Mat img;
@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std
devInfo = GET_PARAM(0); devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID()); cv::cuda::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); const bool grayScale = GET_PARAM(2);
if(grayScale)
img = readImage(GET_PARAM(1), IMREAD_GRAYSCALE);
else {
Mat imgBgr = readImage(GET_PARAM(1));
cv::cvtColor(imgBgr, img, COLOR_BGR2BGRA);
}
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
} }
}; };
@ -263,10 +269,11 @@ CUDA_TEST_P(CalTech, HOG)
#endif #endif
} }
#define GREYSCALE true, false
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES, INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png", ::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png", "caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png"))); "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE)));
//------------------------variable GPU HOG Tests------------------------// //------------------------variable GPU HOG Tests------------------------//

@ -50,8 +50,7 @@
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <iostream>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
@ -64,224 +63,6 @@ namespace pyrlk
__constant__ int c_halfWin_y; __constant__ int c_halfWin_y;
__constant__ int c_iters; __constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
CV_UNUSED(I);
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
CV_UNUSED(J);
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val) __device__ __forceinline__ void accum(float& dst, const float& val)
{ {
dst += val; dst += val;
@ -364,8 +145,8 @@ namespace pyrlk
} }
}; };
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T> template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T, class Ptr2D>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) __global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{ {
#if __CUDA_ARCH__ <= 110 #if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128; const int BLOCK_SIZE = 128;
@ -413,15 +194,14 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f; float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f; float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y); I_patch[i][j] = texI(y, x);
// Scharr Deriv // Scharr Deriv
work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1));
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) - work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1)); (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx; dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy; dIdy_patch[i][j] = dIdy;
@ -490,7 +270,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{ {
work_type I_val = I_patch[i][j]; work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = (J_val - I_val) * 32.0f; work_type diff = (J_val - I_val) * 32.0f;
@ -533,7 +314,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{ {
work_type I_val = I_patch[i][j]; work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = J_val - I_val; work_type diff = J_val - I_val;
@ -749,6 +531,27 @@ namespace pyrlk
} }
} // __global__ void sparseKernel_ } // __global__ void sparseKernel_
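
Both the sparse and dense kernels build their spatial gradients with the 3/10/3 Scharr stencil; for reference, the x-derivative in isolation (a host sketch, single channel, step in elements):

// Scharr x-derivative at (x, y): right column minus left column, with the
// three rows weighted 3, 10, 3. The y-derivative is the transpose.
float scharrDx(const float* img, int step, int x, int y)
{
    return  3.f * img[(y - 1) * step + x + 1] + 10.f * img[y * step + x + 1] + 3.f * img[(y + 1) * step + x + 1]
         - (3.f * img[(y - 1) * step + x - 1] + 10.f * img[y * step + x - 1] + 3.f * img[(y + 1) * step + x - 1]);
}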
// Specialization for non-float data: with integer element types, cudaFilterModeLinear is only compatible with cudaReadModeNormalizedFloat.
template<int cn, class T> class TextureLinear : public cv::cudev::Texture<typename TypeVec<T, cn>::vec_type, typename TypeVec<float, cn>::vec_type> {
public:
typedef typename TypeVec<T, cn>::vec_type elem_type;
typedef typename TypeVec<float, cn>::vec_type ret_type;
__host__ TextureLinear(PtrStepSz<elem_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture<elem_type, ret_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat)
{
}
};
// Specialization for float data, which supports only cudaReadModeElementType (cudaReadModeNormalizedFloat is restricted to 8- and 16-bit integer types).
template<int cn> class TextureLinear<cn, float> : public cv::cudev::Texture<typename TypeVec<float, cn>::vec_type, typename TypeVec<float, cn>::vec_type>
{
public:
typedef typename TypeVec<float, cn>::vec_type float_type;
__host__ TextureLinear(PtrStepSz<float_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture <float_type, float_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType)
{
}
};
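
A usage sketch for the helper (hypothetical caller; the point is that the read-mode split is resolved by the template specialization, not at each call site):

// Identical call syntax for integer and float images; the specialization
// selects the only read mode each element type supports with linear filtering.
void makeTextures(PtrStepSz<uchar> I8u, PtrStepSz<float> I32f)
{
    TextureLinear<1, uchar> tex8(I8u);   // reads return float in [0, 1]
    TextureLinear<1, float> texf(I32f);  // reads return the element values
    // Both convert to cv::cudev::TexturePtr<elem_type, float> for the launch.
}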
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{ {
@ -756,16 +559,16 @@ namespace pyrlk
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream) int level, dim3 block, cudaStream_t stream)
{ {
typedef typename TypeVec<T, cn>::vec_type dType;
typedef typename TypeVec<float, cn>::vec_type rType;
TextureLinear<cn,T> texI(I);
TextureLinear<cn,T> texJ(J);
dim3 grid(ptcount); dim3 grid(ptcount);
CV_UNUSED(I);
CV_UNUSED(J);
if (level == 0 && err) if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols); sparseKernel<cn, PATCH_X, PATCH_Y, true, T, cv::cudev::TexturePtr<dType,rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
else else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols); sparseKernel<cn, PATCH_X, PATCH_Y, false, T, cv::cudev::TexturePtr<dType, rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
@ -903,8 +706,8 @@ namespace pyrlk
}; };
template <bool calcErr> template <bool calcErr, class Ptr2D>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) __global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
@ -925,15 +728,15 @@ namespace pyrlk
float x = xBase - c_halfWin_x + j + 0.5f; float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f; float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_If, x, y); I_patch[i * patchWidth + j] = texI(y, x);
// Scharr Deriv // Scharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) - dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1)); (3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) - dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1, x) + 3 * texI(y + 1, x + 1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1)); (3 * texI(y - 1, x - 1) + 10 * texI(y - 1, x) + 3 * texI(y - 1, x + 1));
} }
} }
@ -1004,7 +807,7 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j) for (int j = 0; j < c_winSize_x; ++j)
{ {
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
int diff = (J - I) * 32; int diff = (J - I) * 32;
@ -1040,7 +843,8 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j) for (int j = 0; j < c_winSize_x; ++j)
{ {
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
errval += ::abs(J - I); errval += ::abs(J - I);
} }
@ -1109,9 +913,6 @@ namespace pyrlk
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call } { sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
}; };
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount, funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream); level, block, stream);
} }
@ -1119,9 +920,8 @@ namespace pyrlk
{ {
dim3 block(16, 16); dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, T>::bindTexture_(I); TextureLinear<1, T> texI(I);
Tex_J<1, T>::bindTexture_(J); TextureLinear<1, T> texJ(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x; const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y; const int patchHeight = block.y + 2 * halfWin.y;
@ -1129,12 +929,12 @@ namespace pyrlk
if (err.data) if (err.data)
{ {
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols); denseKernel<true, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
} }
else else
{ {
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); denseKernel<false, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
} }
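Note on the pyrlk.cu hunks above: the core pattern of the rewrite is that a texture object is created per call and handed to the kernel as an argument, instead of binding a file-scope texture<> reference. Below is a minimal sketch of that pattern against the plain CUDA runtime API; kernel and variable names are illustrative, and the cv::cudev::TextureLinear/TexturePtr wrappers in the diff presumably encapsulate the same calls with RAII.

#include <cuda_runtime.h>

// The texture object travels as a kernel argument; there is no file-scope
// texture<> reference to bind and no hidden global state between launches.
__global__ void sampleKernel(cudaTextureObject_t tex, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<float>(tex, x + 0.5f, y + 0.5f); // bilinear fetch at texel centers
}

int main()
{
    const int w = 64, h = 64;
    size_t pitch;
    float *dSrc, *dDst;
    cudaMallocPitch(&dSrc, &pitch, w * sizeof(float), h);
    cudaMalloc(&dDst, w * h * sizeof(float));

    cudaResourceDesc res = {};                       // describe the pitched 2D source
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};                         // sampling state, owned per object
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModeLinear;            // what a "linear" wrapper would request
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    sampleKernel<<<dim3((w + 15) / 16, (h + 15) / 16), dim3(16, 16)>>>(tex, dDst, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);                   // the wrapper would do this in a destructor
    cudaFree(dSrc);
    cudaFree(dDst);
    return 0;
}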
@ -46,6 +46,7 @@
#include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp" #include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
@ -102,63 +103,8 @@ namespace tvl1flow
} }
} }
struct SrcTex
{
virtual ~SrcTex() {}
__device__ __forceinline__ virtual float I1(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
};
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1y, x, y);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
: tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1y, x, y);
}
cudaTextureObject_t tex_obj_I1;
cudaTextureObject_t tex_obj_I1x;
cudaTextureObject_t tex_obj_I1y;
};
template <
typename T,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void warpBackwardKernel( __global__ void warpBackwardKernel(
const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2, const PtrStepSzf I0, const cv::cudev::TexturePtr<float> I1, const cv::cudev::TexturePtr<float> I1x, const cv::cudev::TexturePtr<float> I1y, const PtrStepf u1, const PtrStepf u2,
PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -189,11 +135,9 @@ namespace tvl1flow
for (int cx = xmin; cx <= xmax; ++cx) for (int cx = xmin; cx <= xmax; ++cx)
{ {
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * src.I1(cx, cy); sum += w * I1(cy, cx);
sumx += w * src.I1x(cx, cy); sumx += w * I1x(cy, cx);
sumy += w * src.I1y(cx, cy); sumy += w * I1y(cy, cx);
wsum += w; wsum += w;
} }
} }
@ -224,49 +168,14 @@ namespace tvl1flow
PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
cudaStream_t stream) cudaStream_t stream)
{ {
cv::cudev::Texture<float> texI1(I1);
cv::cudev::Texture<float> texI1x(I1x);
cv::cudev::Texture<float> texI1y(I1y);
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y)); const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, texI1, texI1x, texI1y, u1, u2, I1w, I1wx, I1wy, grad, rho);
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
if (cc30)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
else
cudaSafeCall(cudaStreamSynchronize(stream));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
}
else
{
bindTexture(&tex_I1, I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
} }
} }
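Note on the tvl1flow.cu hunks: warpBackwardKernel now receives I1, I1x and I1y as TexturePtr arguments, which removes both the virtual SrcTex hierarchy and the runtime cc30 branch. A hedged sketch of the same shape in plain CUDA runtime terms (makeTex and combine are hypothetical names, not part of the diff):

#include <cuda_runtime.h>

// Hypothetical helper: builds a clamped, point-filtered float texture over
// pitched device memory, roughly what the removed cc30 branch assembled by
// hand with createTextureObjectPitch2D.
static cudaTextureObject_t makeTex(float* dev, size_t pitch, int w, int h)
{
    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dev;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);
    return tex;
}

// One kernel, one code path: all three planes arrive as arguments, so there
// is no global texture state and no compute-capability branch on the host.
__global__ void combine(cudaTextureObject_t I1, cudaTextureObject_t I1x,
                        cudaTextureObject_t I1y, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<float>(I1, x, y) + tex2D<float>(I1x, x, y) + tex2D<float>(I1y, x, y);
}

int main()
{
    const int w = 64, h = 64;
    size_t p1, p2, p3;
    float *d1, *d2, *d3, *dOut;
    cudaMallocPitch(&d1, &p1, w * sizeof(float), h);
    cudaMallocPitch(&d2, &p2, w * sizeof(float), h);
    cudaMallocPitch(&d3, &p3, w * sizeof(float), h);
    cudaMalloc(&dOut, w * h * sizeof(float));

    cudaTextureObject_t t1 = makeTex(d1, p1, w, h);
    cudaTextureObject_t t2 = makeTex(d2, p2, w, h);
    cudaTextureObject_t t3 = makeTex(d3, p3, w, h);
    combine<<<dim3(4, 8), dim3(16, 8)>>>(t1, t2, t3, dOut, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(t1); cudaDestroyTextureObject(t2); cudaDestroyTextureObject(t3);
    cudaFree(d1); cudaFree(d2); cudaFree(d3); cudaFree(dOut);
    return 0;
}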
@ -43,8 +43,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/common.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <limits.h> #include <limits.h>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
namespace stereobm namespace stereobm
@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////// Textureness filtering //////////////////////////////////////// /////////////////////////////////// Textureness filtering ////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF; __device__ __forceinline__ float sobel(cv::cudev::TexturePtr<uchar, float> texSrc, int x, int y)
__device__ __forceinline__ float sobel(int x, int y)
{ {
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) + float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) + texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1); texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1);
return fabs(conv); return fabs(conv);
} }
@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device
#define RpT (2 * ROWSperTHREAD) // got experimentally #define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold) __global__ void textureness_kernel(cv::cudev::TexturePtr<uchar,float> texSrc, PtrStepSzb disp, int winsz, float threshold)
{ {
int winsz2 = winsz/2; int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2; int n_dirty_pixels = (winsz2) * 2;
@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device
for(int i = y - winsz2; i <= y + winsz2; ++i) for(int i = y - winsz2; i <= y + winsz2; ++i)
{ {
sum += sobel(x - winsz2, i); sum += sobel(texSrc, x - winsz2, i);
if (cols_extra) if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i); sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i);
} }
*cols = sum; *cols = sum;
if (cols_extra) if (cols_extra)
@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device
for(int y = beg_row + 1; y < end_row; ++y) for(int y = beg_row + 1; y < end_row; ++y)
{ {
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2); sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2);
*cols = sum; *cols = sum;
if (cols_extra) if (cols_extra)
{ {
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2); sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra; *cols_extra = sum_extra;
} }
@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream) void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
{ {
avgTexturenessThreshold *= winsz * winsz; avgTexturenessThreshold *= winsz * winsz;
cv::cudev::Texture<unsigned char, float> tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat);
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1); dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x); grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT); grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float); size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold); textureness_kernel<<<grid, threads, smem_size, stream>>>(tex, disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
} }
} // namespace stereobm } // namespace stereobm
}}} // namespace cv { namespace cuda { namespace cudev }}} // namespace cv { namespace cuda { namespace cudev
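Note on the stereobm.cu hunks: the textureness filter keeps reading the 8-bit input as a normalized float, now through a texture object constructed with cudaReadModeNormalizedFloat. A standalone sketch of that read mode with the same 3x3 Sobel taps as sobel() above; clamp addressing and point filtering are used below for simplicity, whereas the wrapper call in the diff requests wrap addressing and linear filtering (wrap addressing in the texture-object API requires normalized coordinates):

#include <cuda_runtime.h>

// uchar texels come back as floats in [0,1] thanks to the read mode below.
__device__ __forceinline__ float sobelX(cudaTextureObject_t t, int x, int y)
{
    return fabsf(tex2D<float>(t, x - 1, y - 1) * -1.f + tex2D<float>(t, x + 1, y - 1) * 1.f +
                 tex2D<float>(t, x - 1, y    ) * -2.f + tex2D<float>(t, x + 1, y    ) * 2.f +
                 tex2D<float>(t, x - 1, y + 1) * -1.f + tex2D<float>(t, x + 1, y + 1) * 1.f);
}

__global__ void sobelKernel(cudaTextureObject_t t, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = sobelX(t, x, y);
}

int main()
{
    const int w = 32, h = 32;
    size_t pitch;
    unsigned char* dSrc; float* dOut;
    cudaMallocPitch(&dSrc, &pitch, w, h);
    cudaMalloc(&dOut, w * h * sizeof(float));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeNormalizedFloat;   // 8-bit input fetched as [0,1] float

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);
    sobelKernel<<<dim3(2, 4), dim3(16, 8)>>>(tex, dOut, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dSrc); cudaFree(dOut);
    return 0;
}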
@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -77,8 +78,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst); remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -98,8 +99,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device
} }
}; };
#define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \ template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStreamTex
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ {
struct tex_remap_ ## type ## _reader \ static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
{ \ PtrStepSz< T > dst, const float* borderValue, bool cc20)
typedef type elem_type; \ {
typedef int index_type; \ typedef typename TypeVec<float, VecTraits< T >::cn>::vec_type work_type;
int xoff, yoff; \ dim3 block(32, cc20 ? 8 : 4);
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{ \ {
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \ cudev::Texture<T> texSrcWhole(srcWhole);
} \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
}; \ BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \ Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
{ \ remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort) }
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort2) else {
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort4) cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short) cudaSafeCall( cudaGetLastError() );
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short2) cudaSafeCall( cudaDeviceSynchronize() );
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short4) }
};
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int) template <template <typename> class Filter, typename T> struct RemapDispatcherNonStreamTex<Filter, BrdReplicate, T>
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int2) {
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int4) static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float*, bool)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
Filter<cudev::TexturePtr<T>> filter_src(texSrcWhole);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_REMAP_TEX template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar> :
RemapDispatcherNonStreamTex<Filter, B, uchar> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar4> :
RemapDispatcherNonStreamTex<Filter, B, uchar4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort> :
RemapDispatcherNonStreamTex<Filter, B, ushort> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort4> :
RemapDispatcherNonStreamTex<Filter, B, ushort4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short> :
RemapDispatcherNonStreamTex<Filter, B, short> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short4> :
RemapDispatcherNonStreamTex<Filter, B, short4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float> :
RemapDispatcherNonStreamTex<Filter, B, float> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float4> :
RemapDispatcherNonStreamTex<Filter, B, float4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float4> {};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{ {
@ -234,37 +243,23 @@ namespace cv { namespace cuda { namespace device
} }
}; };
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap, callers[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20); static_cast<PtrStepSz<T>>(dst), borderValue, stream, cc20);
} }
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc } // namespace imgproc
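Note on the remap.cu hunks: the per-type OPENCV_CUDA_IMPLEMENT_REMAP_TEX macro becomes one RemapDispatcherNonStreamTex class template, and the texture-capable element types opt in through empty derived specializations of RemapDispatcherNonStream. A stripped-down, host-only sketch of that dispatch trick (all names below are hypothetical):

#include <cstdio>

template <typename T> struct DispatcherGlobMem
{
    static void call(const T*) { std::printf("global-memory path\n"); }
};

template <typename T> struct DispatcherTex
{
    static void call(const T*) { std::printf("texture-object path\n"); }
};

// Primary template: every element type goes through the generic path...
template <typename T> struct Dispatcher : DispatcherGlobMem<T> {};

// ...and each type CUDA textures can sample opts in with one empty derived
// specialization per type, replacing one macro expansion per type.
template <> struct Dispatcher<unsigned char> : DispatcherTex<unsigned char> {};
template <> struct Dispatcher<float> : DispatcherTex<float> {};

int main()
{
    unsigned char c = 0; float f = 0.f; double d = 0.0;
    Dispatcher<unsigned char>::call(&c); // texture-object path
    Dispatcher<float>::call(&f);         // texture-object path
    Dispatcher<double>::call(&d);        // global-memory path
    return 0;
}

The payoff is the same as in the diff: adding or removing a supported type is one line, and the compiler rather than the preprocessor checks each instantiation.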
@ -49,6 +49,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -105,7 +106,7 @@ namespace cv { namespace cuda { namespace device
} }
} }
template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx) template <class Ptr2D, typename T> __global__ void resize(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
{ {
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
@ -130,54 +131,6 @@ namespace cv { namespace cuda { namespace device
} }
} }
// textures
template <typename T> struct TextureAccessor;
#define OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(type) \
texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
template <> struct TextureAccessor<type> \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff; \
int yoff; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_##type, x + xoff, y + yoff); \
} \
__host__ static void bind(const PtrStepSz<type>& mat) \
{ \
bindTexture(&tex_resize_##type, mat); \
} \
};
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_RESIZE_TEX
template <typename T>
TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
{
TextureAccessor<T>::bind(mat);
TextureAccessor<T> t;
t.xoff = xoff;
t.yoff = yoff;
return t;
}
// callers for nearest interpolation // callers for nearest interpolation
template <typename T> template <typename T>
@ -194,14 +147,19 @@ namespace cv { namespace cuda { namespace device
} }
template <typename T> template <typename T>
void call_resize_nearest_tex(const PtrStepSz<T>& /*src*/, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx) void call_resize_nearest_tex(const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (xoff || yoff) {
resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
resize<cudev::TextureOffPtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
resize<cudev::TexturePtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -225,27 +183,21 @@ namespace cv { namespace cuda { namespace device
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data) if (srcWhole.data == src.data)
{ {
TextureAccessor<T> texSrc = texAccessor(src, 0, 0); cudev::Texture<T> texSrc(src);
LinearFilter< TextureAccessor<T> > filteredSrc(texSrc); LinearFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
else else
{ {
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd); BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc); LinearFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -258,8 +210,8 @@ namespace cv { namespace cuda { namespace device
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdReplicate<T>> brdSrc(src, brd);
CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc); CubicFilter<BorderReader<PtrStep<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx); resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -273,27 +225,21 @@ namespace cv { namespace cuda { namespace device
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data) if (srcWhole.data == src.data)
{ {
TextureAccessor<T> texSrc = texAccessor(src, 0, 0); cudev::Texture<T> texSrc(src);
CubicFilter< TextureAccessor<T> > filteredSrc(texSrc); CubicFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
else else
{ {
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd); BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc); CubicFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -318,7 +264,7 @@ namespace cv { namespace cuda { namespace device
if (fx > 1 || fy > 1) if (fx > 1 || fy > 1)
call_resize_nearest_glob(src, dst, fy, fx, 0); call_resize_nearest_glob(src, dst, fy, fx, 0);
else else
call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx); call_resize_nearest_tex(srcWhole, yoff, xoff, dst, fy, fx);
} }
} }
}; };
@ -389,7 +335,7 @@ namespace cv { namespace cuda { namespace device
{ {
if (stream) if (stream)
call_resize_cubic_glob(src, dst, fy, fx, stream); call_resize_cubic_glob(src, dst, fy, fx, stream);
else else
call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx); call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
} }
}; };
@ -421,16 +367,16 @@ namespace cv { namespace cuda { namespace device
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN) if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
{ {
BrdConstant<T> brd(src.rows, src.cols); BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy); IntegerAreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst); resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
} }
else else
{ {
BrdConstant<T> brd(src.rows, src.cols); BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy); AreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst); resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
} }
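Note on the resize.cu hunks: the removed TextureAccessor carried an (xoff, yoff) ROI offset next to a bound texture reference; cudev::TextureOff evidently plays the same role over a texture object. An illustrative reduction of that idea (TexOff and resizeNearest are made-up names, not the cv::cudev API):

#include <cuda_runtime.h>

// Illustrative functor: the ROI offset lives next to the texture object, and
// the whole thing is an ordinary value that can be passed to a kernel.
template <typename T> struct TexOff
{
    typedef T elem_type;
    typedef int index_type;
    cudaTextureObject_t tex;
    int xoff, yoff;
    __device__ __forceinline__ T operator()(int y, int x) const
    {
        return tex2D<T>(tex, x + xoff, y + yoff); // fold the ROI offset into the fetch
    }
};

template <class Ptr2D, typename T>
__global__ void resizeNearest(Ptr2D src, T* dst, int w, int h, float fy, float fx)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        dst[y * w + x] = src(__float2int_rz(y * fy), __float2int_rz(x * fx));
}

int main()
{
    const int w = 32, h = 32;
    size_t pitch;
    float *dSrc, *dDst;
    cudaMallocPitch(&dSrc, &pitch, w * sizeof(float), h);
    cudaMalloc(&dDst, w * h * sizeof(float));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp; // clamp covers out-of-ROI reads
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    TexOff<float> src = { tex, 0, 0 };   // zero offset: the whole image is the ROI
    resizeNearest<<<dim3(2, 4), dim3(16, 8)>>>(src, dDst, w, h, 2.f, 2.f);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dSrc); cudaFree(dDst);
    return 0;
}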
@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -164,8 +165,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat); warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -186,8 +187,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -196,86 +197,48 @@ namespace cv { namespace cuda { namespace device
} }
}; };
#define OPENCV_CUDA_IMPLEMENT_WARP_TEX(type) \ template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStreamTex
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ {
struct tex_warp_ ## type ## _reader \ static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20)
{ \ {
typedef type elem_type; \ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
typedef int index_type; \ dim3 block(32, cc20 ? 8 : 4);
int xoff, yoff; \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \ if (xoff || yoff) {
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
{ \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \ BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
} \ Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
}; \ warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \ }
{ \ else {
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \ cudev::Texture<T> texSrcWhole(srcWhole);
{ \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \ BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
dim3 block(32, cc20 ? 8 : 4); \ Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
bindTexture(&tex_warp_ ## type , srcWhole); \ }
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \ cudaSafeCall( cudaGetLastError() );
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \ cudaSafeCall( cudaDeviceSynchronize() );
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \ }
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \ };
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_WARP_TEX template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float4> {};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{ {
@ -319,8 +282,8 @@ namespace cv { namespace cuda { namespace device
} }
}; };
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, funcs[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20); static_cast<PtrStepSz<T>>(dst), borderValue, warpMat, stream, cc20);
} }
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
@ -330,32 +293,18 @@ namespace cv { namespace cuda { namespace device
} }
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -366,32 +315,18 @@ namespace cv { namespace cuda { namespace device
 }
 
 template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 } // namespace imgproc

@@ -42,6 +42,8 @@
 #ifndef __OPENCV_TEST_PRECOMP_HPP__
 #define __OPENCV_TEST_PRECOMP_HPP__
 
+#include <thread>
+
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/cuda_test.hpp"

@@ -206,6 +206,60 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeSameAsHost, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
     WHOLE_SUBMAT));
 
+PARAM_TEST_CASE(ResizeTextures, cv::cuda::DeviceInfo, Interpolation)
+{
+    cv::cuda::DeviceInfo devInfo;
+    Interpolation interpolation;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        interpolation = GET_PARAM(1);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+void ResizeThread(const Interpolation interp, const GpuMat& imgIn, const std::vector<GpuMat>& imgsOut, Stream& stream) {
+    for (auto& imgOut : imgsOut)
+        cv::cuda::resize(imgIn, imgOut, imgOut.size(), 0, 0, interp, stream);
+}
+
+CUDA_TEST_P(ResizeTextures, Accuracy)
+{
+    constexpr int nThreads = 5;
+    constexpr int nIters = 5;
+    const Size szIn(100, 100);
+    const Size szOut(200, 200);
+    vector<Stream> streams(nThreads, cv::cuda::Stream::Null());
+    vector<GpuMat> imgsIn;
+    vector<vector<GpuMat>> imgsOut;
+    for (int i = 0; i < nThreads; i++) {
+        imgsIn.push_back(GpuMat(szIn, CV_8UC1, i));
+        vector<GpuMat> imgsOutPerThread;
+        for (int j = 0; j < nIters; j++)
+            imgsOutPerThread.push_back(GpuMat(szOut, CV_8UC1));
+        imgsOut.push_back(imgsOutPerThread);
+    }
+
+    vector<std::thread> thread(nThreads);
+    for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(ResizeThread, interpolation, std::ref(imgsIn.at(i)), std::ref(imgsOut.at(i)), std::ref(streams.at(i)));
+    for (int i = 0; i < nThreads; i++) thread.at(i).join();
+
+    for (int i = 0; i < nThreads; i++) {
+        GpuMat imgOutGs;
+        cv::cuda::resize(imgsIn.at(i), imgOutGs, szOut, 0, 0, interpolation, streams.at(i));
+        Mat imgOutGsHost; imgOutGs.download(imgOutGsHost);
+        for (const auto& imgOut : imgsOut.at(i)) {
+            Mat imgOutHost; imgOut.download(imgOutHost);
+            ASSERT_TRUE(cv::norm(imgOutHost, imgOutGsHost, NORM_INF) == 0);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+
 }} // namespace
 #endif // HAVE_CUDA
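
Note: the ResizeTextures test above drives cv::cuda::resize from several host threads at once. The failure mode it guards against is the legacy global texture-reference state, where every call bound its source image to one shared reference, so concurrent calls could sample each other's inputs. Texture objects remove the shared state because each call creates its own handle. A minimal sketch of that per-call pattern, assuming a pitched 8-bit image already on the device (the helper name makeTex2D is illustrative, not part of this patch):

    #include <cuda_runtime.h>

    // Build a 2D pitched texture object for one call. Every caller gets its own
    // handle, so concurrent callers cannot clobber each other the way a single
    // global texture reference could.
    static cudaTextureObject_t makeTex2D(unsigned char* devPtr, int rows, int cols, size_t step)
    {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypePitch2D;
        res.res.pitch2D.devPtr = devPtr;
        res.res.pitch2D.height = rows;
        res.res.pitch2D.width = cols;
        res.res.pitch2D.pitchInBytes = step;
        res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

        cudaTextureDesc td = {};
        td.filterMode = cudaFilterModePoint;
        td.addressMode[0] = cudaAddressModeClamp;
        td.addressMode[1] = cudaAddressModeClamp;
        td.readMode = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &res, &td, 0);
        return tex; // caller destroys it with cudaDestroyTextureObject(tex)
    }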

@@ -1,147 +1,159 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma once
-
-#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-#define OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-
-#include <cstring>
-#include "../common.hpp"
-#include "glob.hpp"
-#include "gpumat.hpp"
-#include "traits.hpp"
-
-#if CUDART_VERSION >= 5050
-
-namespace
-{
-    template <typename T> struct CvCudevTextureRef
-    {
-        typedef texture<T, cudaTextureType2D, cudaReadModeElementType> TexRef;
-
-        static TexRef ref;
-
-        __host__ static void bind(const cv::cudev::GlobPtrSz<T>& mat,
-                                  bool normalizedCoords = false,
-                                  cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                                  cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-        {
-            ref.normalized = normalizedCoords;
-            ref.filterMode = filterMode;
-            ref.addressMode[0] = addressMode;
-            ref.addressMode[1] = addressMode;
-            ref.addressMode[2] = addressMode;
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-
-            CV_CUDEV_SAFE_CALL( cudaBindTexture2D(0, &ref, mat.data, &desc, mat.cols, mat.rows, mat.step) );
-        }
-
-        __host__ static void unbind()
-        {
-            cudaUnbindTexture(ref);
-        }
-    };
-
-    template <typename T>
-    typename CvCudevTextureRef<T>::TexRef CvCudevTextureRef<T>::ref;
-}
-
-#endif
-
-namespace cv { namespace cudev {
-
-//! @addtogroup cudev
-//! @{
-
-#if CUDART_VERSION >= 5050
-
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
-
-    cudaTextureObject_t texObj;
-
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return tex2D(CvCudevTextureRef<T>::ref, x, y);
-    #else
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-    bool cc30;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-
-        rows = mat.rows;
-        cols = mat.cols;
-
-        if (cc30)
-        {
-            // Use the texture object
-            cudaResourceDesc texRes;
-            std::memset(&texRes, 0, sizeof(texRes));
-            texRes.resType = cudaResourceTypePitch2D;
-            texRes.res.pitch2D.devPtr = mat.data;
-            texRes.res.pitch2D.height = mat.rows;
-            texRes.res.pitch2D.width = mat.cols;
-            texRes.res.pitch2D.pitchInBytes = mat.step;
-            texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
+#define OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+#include <opencv2/core/cuda_types.hpp>
+#include <opencv2/cudev/common.hpp>
+#include <opencv2/cudev/ptr2d/traits.hpp>
+
+/** \file texture.hpp
+ */
+
+namespace cv { namespace cudev {
+
+//! @addtogroup cudev
+//! @{
+
+/** @brief Simple lightweight structures that encapsulate information about an image texture on the device.
+ * They are intended to be passed to nvcc-compiled code.
+ */
+template<class T, class R = T>
+struct TexturePtr {
+    typedef R elem_type, value_type;
+    typedef float index_type;
+    __host__ TexturePtr() {};
+    __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        return tex2D<R>(tex, x, y);
+    }
+    __device__ __forceinline__ R operator ()(index_type x) const {
+        return tex1Dfetch<R>(tex, x);
+    }
+private:
+    cudaTextureObject_t tex;
+};
+
+// textures are a maximum of 32 bits wide, 64 bits is read as two 32 bit wide values
+template <class R>
+struct TexturePtr<uint64, R> {
+    typedef float index_type;
+    __host__ TexturePtr() {};
+    __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        const uint2 retVal = tex2D<uint2>(tex, x, y);
+        return *(reinterpret_cast<const R*>(&retVal));
+    }
+    __device__ __forceinline__ R operator ()(index_type x) const {
+        const uint2 retVal = tex1Dfetch<uint2>(tex, x);
+        return *(reinterpret_cast<const R*>(&retVal));
+    }
+private:
+    cudaTextureObject_t tex;
+};
+
+template<class T, class R = T>
+struct TextureOffPtr {
+    typedef R elem_type;
+    typedef float index_type;
+    __host__ TextureOffPtr(const cudaTextureObject_t tex_, const int yoff_, const int xoff_) : tex(tex_), yoff(yoff_), xoff(xoff_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        return tex2D<R>(tex, x + xoff, y + yoff);
+    }
+private:
+    cudaTextureObject_t tex;
+    int xoff = 0;
+    int yoff = 0;
+};
+
+/** @brief non-copyable smart CUDA texture object
+ *
+ * UniqueTexture is a smart non-sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class UniqueTexture {
+public:
+    __host__ UniqueTexture() noexcept { }
+    __host__ UniqueTexture(UniqueTexture&) = delete;
+    __host__ UniqueTexture(UniqueTexture&& other) noexcept {
+        tex = other.tex;
+        other.tex = 0;
+    }
+
+    __host__ UniqueTexture(const int rows, const int cols, T* data, const size_t step, const bool normalizedCoords = false,
+        const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+        const cudaTextureReadMode readMode = cudaReadModeElementType)
+    {
+        create(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+    __host__ UniqueTexture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType)
+    {
+        create(1, static_cast<int>(sizeInBytes/sizeof(T)), data, sizeInBytes, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+    __host__ ~UniqueTexture() {
+        if (tex != cudaTextureObject_t()) {
+            try {
+                CV_CUDEV_SAFE_CALL(cudaDestroyTextureObject(tex));
+            }
+            catch (const cv::Exception& ex) {
+                std::ostringstream os;
+                os << "Exception caught during CUDA texture object destruction.\n";
+                os << ex.what();
+                os << "Exception will be ignored.\n";
+                CV_LOG_WARNING(0, os.str().c_str());
+            }
+        }
+    }
+
+    __host__ UniqueTexture& operator=(const UniqueTexture&) = delete;
+    __host__ UniqueTexture& operator=(UniqueTexture&& other) noexcept {
+        CV_Assert(other);
+        if (&other != this) {
+            UniqueTexture(std::move(*this)); /* destroy current texture object */
+            tex = other.tex;
+            other.tex = cudaTextureObject_t();
+        }
+        return *this;
+    }
+
+    __host__ cudaTextureObject_t get() const noexcept {
+        CV_Assert(tex);
+        return tex;
+    }
+
+    __host__ explicit operator bool() const noexcept { return tex != cudaTextureObject_t(); }
+
+private:
+
+    template <class T1>
+    __host__ void create(const int rows, const int cols, T1* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+        const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
+    {
+        cudaResourceDesc texRes;
+        std::memset(&texRes, 0, sizeof(texRes));
+        if (rows == 1) {
+            CV_Assert(rows == 1 && cols*sizeof(T) == step);
+            texRes.resType = cudaResourceTypeLinear;
+            texRes.res.linear.devPtr = data;
+            texRes.res.linear.sizeInBytes = step;
+            texRes.res.linear.desc = cudaCreateChannelDesc<T1>();
+        }
+        else {
+            texRes.resType = cudaResourceTypePitch2D;
+            texRes.res.pitch2D.devPtr = data;
+            texRes.res.pitch2D.height = rows;
+            texRes.res.pitch2D.width = cols;
+            texRes.res.pitch2D.pitchInBytes = step;
+            texRes.res.pitch2D.desc = cudaCreateChannelDesc<T1>();
+        }
 
         cudaTextureDesc texDescr;
         std::memset(&texDescr, 0, sizeof(texDescr));
@@ -150,109 +162,112 @@ template <typename T> struct Texture : TexturePtr<T>
        texDescr.addressMode[0] = addressMode;
        texDescr.addressMode[1] = addressMode;
        texDescr.addressMode[2] = addressMode;
-            texDescr.readMode = cudaReadModeElementType;
-
-            CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
-        }
-        else
-        {
-            // Use the texture reference
-            CvCudevTextureRef<T>::bind(mat, normalizedCoords, filterMode, addressMode);
-        }
-    }
-
-    __host__ ~Texture()
-    {
-        if (cc30)
-        {
-            // Use the texture object
-            cudaDestroyTextureObject(this->texObj);
-        }
-        else
-        {
-            // Use the texture reference
-            CvCudevTextureRef<T>::unbind();
-        }
-    }
-};
-
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
-
-#else
-
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
-
-    cudaTextureObject_t texObj;
-
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH >= 300
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #else
-        CV_UNUSED(y);
-        CV_UNUSED(x);
-        return T();
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        CV_Assert( deviceSupports(FEATURE_SET_COMPUTE_30) );
-
-        rows = mat.rows;
-        cols = mat.cols;
-
-        // Use the texture object
-        cudaResourceDesc texRes;
-        std::memset(&texRes, 0, sizeof(texRes));
-        texRes.resType = cudaResourceTypePitch2D;
-        texRes.res.pitch2D.devPtr = mat.data;
-        texRes.res.pitch2D.height = mat.rows;
-        texRes.res.pitch2D.width = mat.cols;
-        texRes.res.pitch2D.pitchInBytes = mat.step;
-        texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-
-        cudaTextureDesc texDescr;
-        std::memset(&texDescr, 0, sizeof(texDescr));
-        texDescr.normalizedCoords = normalizedCoords;
-        texDescr.filterMode = filterMode;
-        texDescr.addressMode[0] = addressMode;
-        texDescr.addressMode[1] = addressMode;
-        texDescr.addressMode[2] = addressMode;
-        texDescr.readMode = cudaReadModeElementType;
-
-        CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
-    }
-
-    __host__ ~Texture()
-    {
-        // Use the texture object
-        cudaDestroyTextureObject(this->texObj);
-    }
-};
-
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
-
-#endif
-
-//! @}
+        texDescr.readMode = readMode;
+
+        CV_CUDEV_SAFE_CALL(cudaCreateTextureObject(&tex, &texRes, &texDescr, 0));
+    }
+
+    __host__ void create(const int rows, const int cols, uint64* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+        const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
+    {
+        create<uint2>(rows, cols, (uint2*)data, step, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+private:
+    cudaTextureObject_t tex;
+};
+
+/** @brief sharable smart CUDA texture object
+ *
+ * Texture is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class Texture {
+public:
+    Texture() = default;
+    Texture(const Texture&) = default;
+    Texture(Texture&&) = default;
+
+    __host__ Texture(const int rows_, const int cols_, T* data, const size_t step, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        rows(rows_), cols(cols_), texture(std::make_shared<UniqueTexture<T,R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode))
+    {
+    }
+
+    __host__ Texture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        rows(1), cols(static_cast<int>(sizeInBytes/sizeof(T))), texture(std::make_shared<UniqueTexture<T, R>>(sizeInBytes, data, normalizedCoords, filterMode, addressMode, readMode))
+    {
+    }
+
+    __host__ Texture(PtrStepSz<T> src, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        Texture(src.rows, src.cols, src.data, src.step, normalizedCoords, filterMode, addressMode, readMode)
+    {
+    }
+
+    Texture& operator=(const Texture&) = default;
+    Texture& operator=(Texture&&) = default;
+
+    __host__ explicit operator bool() const noexcept {
+        if (!texture)
+            return false;
+        return texture->operator bool();
+    }
+
+    __host__ operator TexturePtr<T, R>() const {
+        if (texture)
+            return TexturePtr<T, R>(texture->get());
+        else
+            return TexturePtr<T, R>(cudaTextureObject_t());
+    }
+
+    int rows = 0;
+    int cols = 0;
+
+protected:
+    std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+};
+
+template <typename T, typename R> struct PtrTraits<Texture<T, R>> : PtrTraitsBase<Texture<T, R>, TexturePtr<T, R>>
+{
+};
+
+/** @brief sharable smart CUDA texture object with offset
+ *
+ * TextureOff is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class TextureOff {
+public:
+    TextureOff(const TextureOff&) = default;
+    TextureOff(TextureOff&&) = default;
+
+    __host__ TextureOff(const int rows, const int cols, T* data, const size_t step, const int yoff_ = 0, const int xoff_ = 0, const bool normalizedCoords = false,
+        const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+        const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        texture(std::make_shared<UniqueTexture<T, R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode)), xoff(xoff_), yoff(yoff_)
+    {
+    }
+
+    __host__ TextureOff(PtrStepSz<T> src, const int yoff = 0, const int xoff = 0, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        TextureOff(src.rows, src.cols, src.data, src.step, yoff, xoff, normalizedCoords, filterMode, addressMode, readMode)
+    {
+    }
+
+    TextureOff& operator=(const TextureOff&) = default;
+    TextureOff& operator=(TextureOff&&) = default;
+
+    __host__ operator TextureOffPtr<T, R>() const {
+        return TextureOffPtr<T, R>(texture->get(), yoff, xoff);
+    }
+
+private:
+    int xoff = 0;
+    int yoff = 0;
+    std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+};
+
 }}
 
 #endif
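
A minimal usage sketch of the wrappers above (the kernel and function names are illustrative, not part of this patch), assuming a single-channel 8-bit GpuMat: the host side holds the RAII Texture, and the kernel takes the lightweight TexturePtr, obtained through Texture's implicit conversion at launch, which is the same pattern the kernels in this patch use.

    #include <opencv2/cudev/ptr2d/texture.hpp>

    using namespace cv;

    __global__ void copyThroughTex(cudev::TexturePtr<uchar> src, cuda::PtrStepSzb dst)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < dst.cols && y < dst.rows)
            dst(y, x) = src(y, x); // operator()(y, x) maps to tex2D(tex, x, y)
    }

    void copyViaTexture(const cuda::GpuMat& src, cuda::GpuMat& dst)
    {
        // Texture owns the cudaTextureObject_t; destruction is deferred until
        // the last copy of the shared handle goes away.
        cudev::Texture<uchar> tex(src.rows, src.cols, src.data, src.step);
        const dim3 block(32, 8);
        const dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
        copyThroughTex<<<grid, block>>>(tex, dst);
    }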

@@ -213,7 +213,7 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
     return __hiloint2double(hi, lo);
 }
 
-__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
+__device__ __forceinline__ uint64 shfl_up(uint64 val, uint delta, int width = warpSize)
 {
     return __shfl_up(val, delta, width);
 }
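
The uint64 overload above assumes a shuffle that can move 64 bits at once; where only 32-bit shuffles are available, the same result is obtained by shuffling the two halves separately, exactly as the double overload's __hiloint2double path does. A sketch of that split (the name shfl_up64 is illustrative, not part of this patch):

    __device__ __forceinline__ unsigned long long shfl_up64(unsigned long long val, unsigned int delta, int width)
    {
        int lo = static_cast<int>(val & 0xffffffffULL);
        int hi = static_cast<int>(val >> 32);
        lo = __shfl_up(lo, delta, width); // shuffle low 32 bits
        hi = __shfl_up(hi, delta, width); // shuffle high 32 bits
        return (static_cast<unsigned long long>(static_cast<unsigned int>(hi)) << 32)
             | static_cast<unsigned long long>(static_cast<unsigned int>(lo));
    }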

@@ -51,6 +51,7 @@
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -59,23 +60,19 @@ namespace cv { namespace cuda { namespace device
     void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
     void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-    void bindImgTex(PtrStepSzb img);
-    size_t bindSumTex(PtrStepSz<unsigned int> sum);
-    size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
         int octave, int nOctaveLayer);
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
         int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
     void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
         float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
         unsigned int* featureCounter);
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
 }
}}}
@@ -121,34 +118,8 @@ namespace cv { namespace cuda { namespace device
         cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
     }
 
-    ////////////////////////////////////////////////////////////////////////
-    // Integral image texture
-
-    texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-    void bindImgTex(PtrStepSzb img)
-    {
-        bindTexture(&imgTex, img);
-    }
-
-    size_t bindSumTex(PtrStepSz<uint> sum)
-    {
-        size_t offset;
-        cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-        cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
-        return offset / sizeof(uint);
-    }
-
-    size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
-    {
-        size_t offset;
-        cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-        cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
-        return offset / sizeof(uint);
-    }
-
-    template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
+    template <int N> __device__ float icvCalcHaarPatternSum(cudev::TexturePtr<unsigned int> texSum, const float src[][5], int oldSize, int newSize, int y, int x)
     {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
         typedef double real_t;
@@ -169,10 +140,10 @@ namespace cv { namespace cuda { namespace device
             int dy2 = __float2int_rn(ratio * src[k][3]);
 
             real_t t = 0;
-            t += tex2D(sumTex, x + dx1, y + dy1);
-            t -= tex2D(sumTex, x + dx1, y + dy2);
-            t -= tex2D(sumTex, x + dx2, y + dy1);
-            t += tex2D(sumTex, x + dx2, y + dy2);
+            t += texSum(y + dy1, x + dx1);
+            t -= texSum(y + dy2, x + dx1);
+            t -= texSum(y + dy1, x + dx2);
+            t += texSum(y + dy2, x + dx2);
 
             d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
         }
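
For reference, the four texSum reads above are the standard integral-image rectangle sum: with S the integral of src (zero first row and column, as cv::cuda::integral produces), the sum of src over the half-open rectangle [x1, x2) x [y1, y2) is S(y2, x2) - S(y1, x2) - S(y2, x1) + S(y1, x1). The same computation without the texture path, as a sketch (helper name illustrative, not part of this patch):

    // S is the integral image: S(y, x) = sum of src over rows [0, y) and cols [0, x).
    __device__ float rectSum(const cv::cuda::PtrStep<unsigned int> S, int x1, int y1, int x2, int y2)
    {
        return static_cast<float>(S(y2, x2) - S(y1, x2) - S(y2, x1) + S(y1, x1));
    }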
@@ -201,7 +172,7 @@ namespace cv { namespace cuda { namespace device
         return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
     }
 
-    __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
+    __global__ void icvCalcLayerDetAndTrace(cudev::TexturePtr<unsigned int> texSum, PtrStepf det, PtrStepf trace)
     {
         // Determine the indices
         const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@@ -222,29 +193,29 @@ namespace cv { namespace cuda { namespace device
         if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
         {
-            const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
-            const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
-            const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
+            const float dx  = icvCalcHaarPatternSum<3>(texSum, c_DX , 9, size, (i << c_octave), (j << c_octave));
+            const float dy  = icvCalcHaarPatternSum<3>(texSum, c_DY , 9, size, (i << c_octave), (j << c_octave));
+            const float dxy = icvCalcHaarPatternSum<4>(texSum, c_DXY, 9, size, (i << c_octave), (j << c_octave));
 
             det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
             trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
         }
     }
 
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
         int octave, int nOctaveLayers)
     {
         const int min_size = calcSize(octave, 0);
         const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
         const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
 
+        cudev::Texture<unsigned int> texSum(sum);
+
         dim3 threads(16, 16);
 
         dim3 grid;
         grid.x = divUp(max_samples_j, threads.x);
         grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
 
-        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
+        icvCalcLayerDetAndTrace<<<grid, threads>>>(texSum, det, trace);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -255,10 +226,14 @@ namespace cv { namespace cuda { namespace device
     __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
 
-    struct WithMask
+    template<bool useMask = true>
+    struct Mask
     {
-        static __device__ bool check(int sum_i, int sum_j, int size)
+        __host__ Mask(){};
+        __host__ Mask(cudev::TexturePtr<unsigned int> tex_): tex(tex_) {};
+        __device__ bool check(int sum_i, int sum_j, int size)
         {
+            if (!useMask) return true;
             float ratio = (float)size / 9.0f;
 
             float d = 0;
@@ -269,19 +244,20 @@ namespace cv { namespace cuda { namespace device
             int dy2 = __float2int_rn(ratio * c_DM[3]);
 
             float t = 0;
-            t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
-            t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
-            t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
-            t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
+            t += tex(sum_i + dy1, sum_j + dx1);
+            t -= tex(sum_i + dy2, sum_j + dx1);
+            t -= tex(sum_i + dy1, sum_j + dx2);
+            t += tex(sum_i + dy2, sum_j + dx2);
 
             d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 
             return (d >= 0.5f);
         }
+
+        cudev::TexturePtr<unsigned int> tex;
     };
 
-    template <typename Mask>
-    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
+    template<class T>
+    __global__ void icvFindMaximaInLayer(T mask, const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
         unsigned int* maxCounter)
     {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
@@ -323,7 +299,7 @@ namespace cv { namespace cuda { namespace device
             const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
             const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 
-            if (Mask::check(sum_i, sum_j, size))
+            if (mask.check(sum_i, sum_j, size))
             {
                 // Check to see if we have a max (in its 26 neighbours)
                 const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@@ -374,7 +350,7 @@ namespace cv { namespace cuda { namespace device
     #endif
     }
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
         int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
     {
         const int layer_rows = img_rows >> octave;
@@ -390,10 +366,15 @@ namespace cv { namespace cuda { namespace device
         const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 
-        if (use_mask)
-            icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
-        else
-            icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
+        if (use_mask) {
+            cudev::Texture<unsigned int> texMaskSum(maskSum);
+            Mask<true> mask(texMaskSum);
+            icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
+        }
+        else {
+            Mask<false> mask;
+            icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
+        }
 
         cudaSafeCall( cudaGetLastError() );
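
The Mask<useMask> rewrite above folds the old WithMask / WithOutMask pair into a single functor whose check() short-circuits on a compile-time constant, so the Mask<false> instantiation compiles away the texture reads entirely. A standalone host-side sketch of that template<bool> dispatch pattern (names illustrative, not part of this patch):

    #include <cstdio>

    template <bool enabled>
    struct Gate {
        // With enabled == false the early return is a compile-time constant,
        // so the compiler drops the rest of the body and the unused state.
        bool check(int v) const {
            if (!enabled) return true;
            return v >= threshold;
        }
        int threshold = 0;
    };

    template <class T>
    void runFiltered(T gate) // mirrors the kernel: the concrete Gate<true/false> arrives as a template parameter
    {
        for (int v = -2; v <= 2; ++v)
            if (gate.check(v))
                std::printf("%d ", v);
        std::printf("\n");
    }

    int main()
    {
        runFiltered(Gate<false>{});  // no masking: every value passes
        runFiltered(Gate<true>{1});  // masked: only v >= 1 passes
        return 0;
    }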
@@ -539,7 +520,7 @@ namespace cv { namespace cuda { namespace device
     __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
     __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
-    __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
+    __global__ void icvCalcOrientation(cudev::TexturePtr<unsigned int> texSum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
     {
         __shared__ float s_X[128];
         __shared__ float s_Y[128];
@@ -576,8 +557,8 @@ namespace cv { namespace cuda { namespace device
             if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
                 x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
             {
-                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
-                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
+                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NX, 4, grad_wav_size, y, x);
+                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NY, 4, grad_wav_size, y, x);
 
                 angle = atan2f(Y, X);
                 if (angle < 0)
@@ -676,8 +657,9 @@ namespace cv { namespace cuda { namespace device
     #undef ORI_WIN
     #undef ORI_SAMPLES
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
     {
+        cudev::Texture<unsigned int> texSum(sum);
         dim3 threads;
         threads.x = 32;
         threads.y = 4;
@@ -685,7 +667,7 @@ namespace cv { namespace cuda { namespace device
         dim3 grid;
         grid.x = nFeatures;
 
-        icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
+        icvCalcOrientation<<<grid, threads>>>(texSum, featureX, featureY, featureSize, featureDir);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -724,12 +706,14 @@ namespace cv { namespace cuda { namespace device
     {
         typedef uchar elem_type;
 
+        __device__ WinReader(cudev::TexturePtr<uchar> tex_) : tex(tex_) {};
+
         __device__ __forceinline__ uchar operator ()(int i, int j) const
         {
             float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
             float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
 
-            return tex2D(imgTex, pixel_x, pixel_y);
+            return tex(pixel_y, pixel_x);
         }
 
         float centerX;
@@ -739,19 +723,17 @@ namespace cv { namespace cuda { namespace device
         float sin_dir;
         int width;
         int height;
+        cudev::TexturePtr<uchar> tex;
     };
 
-    __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
-        float& dx, float& dy);
-
-    __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
+    __device__ void calc_dx_dy(cudev::TexturePtr<uchar> tex, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
         float& dx, float& dy)
     {
         __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
 
         dx = dy = 0.0f;
 
-        WinReader win;
+        WinReader win(tex);
 
         win.centerX = featureX[blockIdx.x];
         win.centerY = featureY[blockIdx.x];
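
For reference, the WinReader sampling positions above rotate the (i, j) patch coordinates about the feature center (c_x, c_y), with o = win_offset and theta the feature orientation:

    pixel_x = c_x + (j + o) * cos(theta) + (i + o) * sin(theta)
    pixel_y = c_y - (j + o) * sin(theta) + (i + o) * cos(theta)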
@@ -813,14 +795,14 @@ namespace cv { namespace cuda { namespace device
         }
     }
 
-    __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+    __global__ void compute_descriptors_64(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
     {
         __shared__ float smem[32 * 16];
 
         float* sRow = smem + threadIdx.y * 32;
 
         float dx, dy;
-        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+        calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
         float dxabs = ::fabsf(dx);
         float dyabs = ::fabsf(dy);
@@ -839,14 +821,14 @@ namespace cv { namespace cuda { namespace device
         *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
     }
 
-    __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+    __global__ void compute_descriptors_128(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
     {
         __shared__ float smem[32 * 16];
 
         float* sRow = smem + threadIdx.y * 32;
 
         float dx, dy;
-        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+        calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
         float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
@@ -925,13 +907,13 @@ namespace cv { namespace cuda { namespace device
             descriptor_base[threadIdx.x] = val / s_len;
     }
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
     {
         // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+        cudev::Texture<unsigned char> texImg(img);
 
         if (descriptors.cols == 64)
         {
-            compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+            compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -943,7 +925,7 @@ namespace cv { namespace cuda { namespace device
         }
         else
         {
-            compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+            compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );

@@ -94,23 +94,19 @@ namespace cv { namespace cuda { namespace device
     void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
     void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-    void bindImgTex(PtrStepSzb img);
-    size_t bindSumTex(PtrStepSz<unsigned int> sum);
-    size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
        int octave, int nOctaveLayer);
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
        int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
     void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
        float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
        unsigned int* featureCounter);
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
 }
}}}
@@ -138,10 +134,7 @@ namespace
    class SURF_CUDA_Invoker
    {
    public:
-        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img, const GpuMat& mask) :
-            surf_(surf),
-            img_cols(img.cols), img_rows(img.rows),
-            use_mask(!mask.empty())
+        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img_, const GpuMat& mask) : surf_(surf), img(img_), img_cols(img_.cols), img_rows(img_.rows), use_mask(!mask.empty())
        {
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -167,16 +160,12 @@ namespace
            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
 
-            bindImgTex(img);
-
            cuda::integral(img, surf_.sum);
-            sumOffset = bindSumTex(surf_.sum);
 
            if (use_mask)
            {
                cuda::min(mask, 1.0, surf_.mask1);
                cuda::integral(surf_.mask1, surf_.maskSum);
-                maskOffset = bindMaskSumTex(surf_.maskSum);
            }
        }
@@ -195,9 +184,9 @@ namespace
            const int layer_cols = img_cols >> octave;
 
            loadOctaveConstants(octave, layer_rows, layer_cols);
 
-            icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
+            icvCalcLayerDetAndTrace_gpu(surf_.sum, surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
 
-            icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
+            icvFindMaximaInLayer_gpu(surf_.maskSum, surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
 
            unsigned int maxCounter;
@@ -230,7 +219,7 @@ namespace
            const int nFeatures = keypoints.cols;
            if (nFeatures > 0)
            {
-                icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                icvCalcOrientation_gpu(surf_.sum, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                    keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
            }
        }
@@ -241,7 +230,7 @@ namespace
            if (nFeatures > 0)
            {
                ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
-                compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                compute_descriptors_gpu(img, descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                    keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
            }
        }
@@ -252,6 +241,8 @@ namespace
        SURF_CUDA& surf_;
 
+        GpuMat img;
+
        int img_cols, img_rows;
 
        bool use_mask;
@@ -259,9 +250,6 @@ namespace
        int maxCandidates;
        int maxFeatures;
 
-        size_t maskOffset;
-        size_t sumOffset;
-
        GpuMat counters;
    };
 }
