Merge pull request #3378 from cudawarped:replace_texture_ref_with_texture_obj

Fix CUDA texture bugs and replace all instances of CUDA texture references with texture objects
Alexander Smorkalov authored 2 years ago; committed by GitHub
commit 8db3e627fb
33 changed files (changed line counts in parentheses):
  1. modules/cudaarithm/src/cuda/lut.cu (84)
  2. modules/cudaarithm/src/lut.hpp (6)
  3. modules/cudaimgproc/src/cuda/canny.cu (217)
  4. modules/cudaimgproc/src/cuda/corners.cu (55)
  5. modules/cudaimgproc/src/cuda/debayer.cu (48)
  6. modules/cudaimgproc/src/cuda/gftt.cu (52)
  7. modules/cudaimgproc/src/cuda/hough_segments.cu (25)
  8. modules/cudaimgproc/src/cuda/mean_shift.cu (39)
  9. modules/cudaimgproc/src/gftt.cpp (24)
  10. modules/cudaimgproc/test/test_color.cpp (18)
  11. modules/cudaimgproc/test/test_hough.cpp (64)
  12. modules/cudaimgproc/test/test_precomp.hpp (2)
  13. modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp (4)
  14. modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp (10)
  15. modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu (363)
  16. modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu (352)
  17. modules/cudalegacy/src/cuda/NPP_staging.cu (428)
  18. modules/cudalegacy/src/cuda/bm.cu (24)
  19. modules/cudalegacy/test/TestHypothesesGrow.cpp (3)
  20. modules/cudaobjdetect/src/cuda/hog.cu (52)
  21. modules/cudaobjdetect/test/test_objdetect.cpp (13)
  22. modules/cudaoptflow/src/cuda/pyrlk.cu (306)
  23. modules/cudaoptflow/src/cuda/tvl1flow.cu (113)
  24. modules/cudastereo/src/cuda/stereobm.cu (39)
  25. modules/cudawarping/src/cuda/remap.cu (189)
  26. modules/cudawarping/src/cuda/resize.cu (112)
  27. modules/cudawarping/src/cuda/warp.cu (161)
  28. modules/cudawarping/test/test_precomp.hpp (2)
  29. modules/cudawarping/test/test_resize.cpp (54)
  30. modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp (429)
  31. modules/cudev/include/opencv2/cudev/warp/shuffle.hpp (2)
  32. modules/xfeatures2d/src/cuda/surf.cu (140)
  33. modules/xfeatures2d/src/surf.cuda.cpp (34)

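Every hunk below applies the same pattern: a file-scope texture<...> reference (global state that had to be bound and unbound around each launch, and that could not be used safely from multiple streams or threads) is replaced by an RAII wrapper around a cudaTextureObject_t that is created per call and passed to the kernel as an ordinary argument. A minimal sketch of that pattern, assuming the cv::cudev::Texture / cv::cudev::TexturePtr interfaces this PR adds in texture.hpp; the kernel and launcher names here are illustrative, not from the PR:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

// Kernels receive the accessor by value instead of reading a global texture
// reference; TexturePtr indexes as (row, column).
__global__ void copyThroughTex(cv::cudev::TexturePtr<float> tex, cv::cuda::PtrStepSzf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = tex(y, x);
}

void launchCopy(cv::cuda::PtrStepSzf src, cv::cuda::PtrStepSzf dst, cudaStream_t stream)
{
    // RAII: the constructor creates a texture object over src and the
    // destructor destroys it, so no bind/unbind bookkeeping survives the call.
    cv::cudev::Texture<float> tex(src);
    const dim3 block(32, 8);
    const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
    copyThroughTex<<<grid, block, 0, stream>>>(tex, dst); // Texture converts to TexturePtr
}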
@@ -53,6 +53,7 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv;
using namespace cv::cuda;
@@ -60,8 +61,6 @@ using namespace cv::cudev;
namespace cv { namespace cuda {
texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
@@ -73,83 +72,28 @@ namespace cv { namespace cuda {
Mat h_lut = _lut.getMat();
d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
}
CV_Assert( d_lut.depth() == CV_8U );
CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );
cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = d_lut.data;
texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
}
else
{
// Use the texture reference
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
}
}
LookUpTableImpl::~LookUpTableImpl()
{
if (cc30)
{
// Use the texture object
cudaDestroyTextureObject(texLutTableObj);
}
else
{
// Use the texture reference
cudaUnbindTexture(texLutTable);
}
szInBytes = 256 * d_lut.channels() * sizeof(uchar);
}
struct LutTablePtrC1
{
typedef uchar value_type;
typedef uchar index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar operator ()(uchar, uchar x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex1Dfetch(texLutTable, x);
#else
// Use the texture object
return tex1Dfetch<uchar>(texLutTableObj, x);
#endif
cv::cudev::TexturePtr<uchar> tex;
__device__ __forceinline__ uchar operator ()(uchar, uchar x) const {
return tex(x);
}
};
struct LutTablePtrC3
{
typedef uchar3 value_type;
typedef uchar3 index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
#else
// Use the texture object
return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
#endif
cv::cudev::TexturePtr<uchar> tex;
__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const {
return make_uchar3(tex(x.x * 3), tex(x.y * 3 + 1), tex(x.z * 3 + 2));
}
};
@@ -169,20 +113,18 @@ namespace cv { namespace cuda {
{
GpuMat_<uchar> src1(src.reshape(1));
GpuMat_<uchar> dst1(dst.reshape(1));
cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
LutTablePtrC1 tbl;
tbl.texLutTableObj = texLutTableObj;
tbl.tex = TexturePtr<uchar>(tex);
dst1.assign(lut_(src1, tbl), stream);
}
else if (lut_cn == 3)
{
GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;
cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
LutTablePtrC3 tbl;
tbl.texLutTableObj = texLutTableObj;
tbl.tex = TexturePtr<uchar>(tex);
dst3.assign(lut_(src3, tbl), stream);
}

@@ -15,14 +15,10 @@ class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
~LookUpTableImpl();
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) CV_OVERRIDE;
private:
GpuMat d_lut;
cudaTextureObject_t texLutTableObj;
bool cc30;
size_t szInBytes = 0;
};
} }

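For the LUT path above, the 256-entry table is bound as linear (1D) memory through the size-in-bytes constructor seen in the hunk, and the device functor fetches with TexturePtr::operator()(index). A hedged sketch of the single-channel case; the kernel and helper names are invented for illustration:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__global__ void lutC1(cv::cudev::TexturePtr<uchar> lut, cv::cuda::PtrStepSzb src, cv::cuda::PtrStepSzb dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < src.cols && y < src.rows)
        dst(y, x) = lut(src(y, x)); // tex1Dfetch through the texture object
}

void applyLutC1(const cv::cuda::GpuMat& d_lut, cv::cuda::PtrStepSzb src, cv::cuda::PtrStepSzb dst, cudaStream_t stream)
{
    // Linear-memory texture over the 256 uchar entries; it is destroyed when
    // tex leaves scope, which is what replaces the old unbind-in-destructor.
    cv::cudev::Texture<uchar> tex(256 * sizeof(uchar), d_lut.data);
    const dim3 block(32, 8);
    const dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
    lutC1<<<grid, block, 0, stream>>>(tex, src, dst);
}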
@@ -48,6 +48,7 @@
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device
namespace canny
{
struct SrcTex
{
virtual ~SrcTex() {}
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
int xoff;
int yoff;
};
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const override
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
__device__ __forceinline__ int operator ()(int y, int x) const override
{
return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
}
cudaTextureObject_t tex_src_object;
};
template <
class T,
class Norm,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
template <class Norm>
__global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr<uchar> texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -138,8 +100,8 @@ namespace canny
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1));
int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
@@ -151,63 +113,20 @@ namespace canny
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
cv::cudev::TextureOff<uchar> texSrc(srcWhole, yoff, xoff);
if (L2Grad)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex = 0;
createTextureObjectPitch2D(&tex, srcWhole, texDesc);
SrcTexObj src(xoff, yoff, tex);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
cudaSafeCall( cudaDestroyTextureObject(tex) );
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
}
else
{
bindTexture(&tex_src, srcWhole);
SrcTexRef src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
}
if (stream == NULL)
cudaSafeCall(cudaDeviceSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -229,8 +148,7 @@ namespace canny
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
__global__ void calcMapKernel(cv::cudev::TexturePtr<float> texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
@@ -245,7 +163,7 @@ namespace canny
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
const float m = texMag(y, x);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
@@ -264,69 +182,17 @@ namespace canny
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
if (m > texMag(y, x - 1) && m >= texMag(y, x + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
if (m > texMag(y - 1, x) && m >= texMag(y + 1, x))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D<float>(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s))
edge_type = 1 + (int)(m > high_thresh);
}
}
@@ -338,47 +204,10 @@ namespace canny
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
if (deviceSupports(FEATURE_SET_COMPUTE_30))
{
// Use the texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = mag.ptr();
resDesc.res.pitch2D.height = mag.rows;
resDesc.res.pitch2D.width = mag.cols;
resDesc.res.pitch2D.pitchInBytes = mag.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex=0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
cudaSafeCall( cudaDestroyTextureObject(tex) );
}
else
{
// Use the texture reference
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
cv::cudev::Texture<float> texMag(mag);
calcMapKernel<<<grid, block, 0, stream>>>(texMag, dx, dy, map, low_thresh, high_thresh);
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
}

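The canny hunks also delete the virtual SrcTex / SrcTexRef / SrcTexObj hierarchy: once every kernel takes the accessor as a value parameter, the compute-capability dispatch and the CV_CUDEV_ARCH branches become unnecessary. A sketch of the offset-texture access used by calcMagnitude, assuming the TextureOff / TextureOffPtr types from this PR, with the L2 norm inlined for brevity:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__global__ void sobelMagL2(cv::cudev::TextureOffPtr<uchar> src, cv::cuda::PtrStepSzf mag)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= mag.cols || y >= mag.rows)
        return;
    // Border reads are clamped by the texture unit, so the 3x3 window needs
    // no explicit bounds checks.
    const int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1))
                    - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
    const int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1))
                    - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
    mag(y, x) = sqrtf((float)(dxVal * dxVal + dyVal * dyVal));
}

void launchSobelMagL2(cv::cuda::PtrStepSzb srcWhole, int yoff, int xoff, cv::cuda::PtrStepSzf mag, cudaStream_t stream)
{
    // TextureOff folds the ROI origin into every (y, x) access.
    cv::cudev::TextureOff<uchar> tex(srcWhole, yoff, xoff);
    const dim3 block(16, 16);
    const dim3 grid((mag.cols + block.x - 1) / block.x, (mag.rows + block.y - 1) / block.y);
    sobelMagL2<<<grid, block, 0, stream>>>(tex, mag);
}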
@@ -47,6 +47,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "opencv2/opencv_modules.hpp"
@@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device
{
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
__global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(harrisDxTex, j, i);
float dy = tex2D(harrisDyTex, j, i);
float dx = texDx(i, j);
float dy = texDy(i, j);
a += dx * dx;
b += dx * dy;
@@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device
}
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
__global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device
{
const int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
float dx = texDx(y, x);
float dy = texDy(y, x);
a += dx * dx;
b += dx * dy;
@@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&harrisDxTex, Dx);
bindTexture(&harrisDyTex, Dy);
cv::cudev::Texture<float> texDx(Dx);
cv::cudev::Texture<float> texDy(Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst);
break;
}
@@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
__global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(minEigenValDxTex, j, i);
float dy = tex2D(minEigenValDyTex, j, i);
float dx = texMinEigenValDx(i, j);
float dy = texMinEigenValDy(i, j);
a += dx * dx;
b += dx * dy;
@@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
__global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device
{
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
float dx = texMinEigenValDx(y, x);
float dy = texMinEigenValDy(y, x);
a += dx * dx;
b += dx * dy;
@@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&minEigenValDxTex, Dx);
bindTexture(&minEigenValDyTex, Dy);
cv::cudev::Texture<float> texMinEigenValDx(Dx);
cv::cudev::Texture<float> texMinEigenValDy(Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst);
break;
}

@@ -48,6 +48,7 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/cudev/ptr2d/texture.hpp"
namespace cv { namespace cuda { namespace device
{
@@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
template <typename DstType, class Ptr2D>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
@@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
center.x = x;
center.y = y;
int4 xCoord;
xCoord.x = center.x - 2;
@@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float C = src(center.y, center.x); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1)
Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1)
Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1)
Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
value.x = src(yCoord.x, center.x); // ( 0,-2) A0
value.y = src(yCoord.y, center.x); // ( 0,-1) B0
value.z = src(center.y, xCoord.x); // (-2, 0) E0
value.w = src(center.y, xCoord.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
value.x += src(yCoord.w, center.x); // ( 0, 2) A1
value.y += src(yCoord.z, center.x); // ( 0, 1) B1
value.z += src(center.y, xCoord.w); // ( 2, 0) E1
value.w += src(center.y, xCoord.z); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
@@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
if (sourceOffset.x || sourceOffset.y) {
cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x);
MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
}
else {
cv::cudev::Texture<uchar> texSrc(src);
MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
}
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

@@ -45,36 +45,36 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <thrust/execution_policy.h>
namespace cv { namespace cuda { namespace device
{
namespace gfft
{
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
template <class Mask> __global__ void findCorners(cv::cudev::TexturePtr<float> tex, float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, int *g_counter)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D<float>(eigTex, j, i);
float val = tex(i, j);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex(i - 1, j - 1), maxVal);
maxVal = ::fmax(tex(i - 1, j), maxVal);
maxVal = ::fmax(tex(i - 1, j + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex(i, j - 1), maxVal);
maxVal = ::fmax(tex(i, j + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);
maxVal = ::fmax(tex(i + 1, j - 1), maxVal);
maxVal = ::fmax(tex(i + 1, j), maxVal);
maxVal = ::fmax(tex(i + 1, j + 1), maxVal);
if (val == maxVal)
{
@@ -87,17 +87,18 @@ namespace cv { namespace cuda { namespace device
}
}
int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
{
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
cv::cudev::Texture<float> tex(eig);
dim3 block(16, 16);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counterPtr);
findCorners<<<grid, block, 0, stream>>>(tex, threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols, counterPtr);
else
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counterPtr);
findCorners<<<grid, block, 0, stream>>>(tex, threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols, counterPtr);
cudaSafeCall( cudaGetLastError() );
@@ -113,27 +114,24 @@ namespace cv { namespace cuda { namespace device
class EigGreater
{
public:
EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
{
}
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
EigGreater(cv::cudev::TexturePtr<float> tex_) : tex(tex_) {}
__device__ __forceinline__ bool operator()(float2 a, float2 b) const{
return tex(a.y, a.x) > tex(b.y, b.x);
}
cudaTextureObject_t eigTex;
cv::cudev::TexturePtr<float> tex;
};
void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
{
cv::cudev::Texture<float> tex(eig);
thrust::device_ptr<float2> ptr(corners);
#if THRUST_VERSION >= 100802
if (stream)
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(tex));
else
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(tex));
#else
thrust::sort(ptr, ptr + count, EigGreater(eigTex));
thrust::sort(ptr, ptr + count, EigGreater(tex));
#endif
}
} // namespace optical_flow

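Texture objects also compose with Thrust: TexturePtr is a plain copyable value, so it can live inside a device comparator, something the old global texture references could not do safely. A hedged sketch of the sort performed above (the functor and helper names are illustrative, not the PR's):

#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

struct ScoreGreater
{
    cv::cudev::TexturePtr<float> eig; // copied by value into device code
    __device__ __forceinline__ bool operator()(float2 a, float2 b) const
    {
        return eig(a.y, a.x) > eig(b.y, b.x); // compare eigenvalue scores
    }
};

void sortCornersByScore(cv::cuda::PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
{
    cv::cudev::Texture<float> tex(eig); // must outlive the sort below
    thrust::device_ptr<float2> ptr(corners);
    thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + count, ScoreGreater{tex}); // Texture -> TexturePtr
}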
@@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_segments
{
__global__ void houghLinesProbabilistic(cv::cudev::Texture<uchar> src, const PtrStepSzi accum,
template<class Ptr2D>
__global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum,
int4* out, const int maxSize,
const float rho, const float theta,
const int lineGap, const int lineLength,
@@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cv::cudev::GpuMat_<uchar> src_(mask);
cv::cudev::Texture<uchar> tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp);
houghLinesProbabilistic<<<grid, block, 0, stream>>>(tex, accum,
out, maxSize,
rho, theta,
lineGap, lineLength,
mask.rows, mask.cols,
counterPtr);
Size wholeSize;
Point ofs;
mask.locateROI(wholeSize, ofs);
if (ofs.x || ofs.y) {
cv::cudev::TextureOff<uchar> texMask(wholeSize.height, wholeSize.width, mask.datastart, mask.step, ofs.y, ofs.x);
houghLinesProbabilistic<cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
}
else {
cv::cudev::Texture<uchar> texMask(mask);
houghLinesProbabilistic<cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
}
cudaSafeCall( cudaGetLastError() );
int totalCount;
@@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);
return totalCount;
}
}

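The mask handling above is the general recipe for texturing a submatrix: locateROI recovers the parent allocation and the view's origin, the texture is created over the whole parent buffer, and the origin becomes a TextureOff offset. Condensed into a hedged sketch (kernel launches elided; the dispatch mirrors the hunk above):

#include <opencv2/cudev/ptr2d/texture.hpp>

void textureOverRoi(const cv::cuda::GpuMat& mask)
{
    cv::Size wholeSize;
    cv::Point ofs;
    mask.locateROI(wholeSize, ofs); // parent extent and this view's origin
    if (ofs.x || ofs.y)
    {
        // Bind the full parent allocation; every (y, x) read is shifted by the
        // ROI origin, so kernels keep using view-relative coordinates.
        cv::cudev::TextureOff<uchar> tex(wholeSize.height, wholeSize.width,
                                         mask.datastart, mask.step, ofs.y, ofs.x);
        // ... launch a kernel taking cv::cudev::TextureOffPtr<uchar>
    }
    else
    {
        cv::cudev::Texture<uchar> tex(mask); // contiguous view: bind directly
        // ... launch a kernel taking cv::cudev::TexturePtr<uchar>
    }
}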
@@ -47,19 +47,16 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
__device__ short2 do_mean_shift(cv::cudev::TexturePtr<uchar4> tex, int x0, int y0, unsigned char* out,size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 );
uchar4 c = tex(y0, x0);
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
@@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device
int rowCount = 0;
for( int x = minx; x <= maxx; x++ )
{
uchar4 t = tex2D( tex_meanshift, x, y );
uchar4 t = tex(y, x);
int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
if( norm2 <= isr2 )
@@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device
return make_short2((short)x0, (short)y0);
}
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
__global__ void meanshift_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
@@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
__global__ void meanshiftproc_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep,
int cols, int rows,int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device
if( x0 < cols && y0 < rows )
{
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
*(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
@@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

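mean_shift.cu shows two remaining conveniences: a Texture can be built from raw (rows, cols, pointer, step) arguments when only a PtrStepSz is at hand, and the resulting TexturePtr can be forwarded through ordinary __device__ helpers such as do_mean_shift instead of each helper reading a global reference. A compressed sketch under the same assumed interfaces; names are illustrative:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__device__ uchar4 sampleAt(cv::cudev::TexturePtr<uchar4> tex, int x, int y)
{
    return tex(y, x); // device helpers take the accessor as a parameter now
}

__global__ void passThrough(cv::cudev::TexturePtr<uchar4> tex, cv::cuda::PtrStepSz<uchar4> dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = sampleAt(tex, x, y);
}

void launchPassThrough(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSz<uchar4> dst, cudaStream_t stream)
{
    // Raw-parameter constructor: rows, cols, device pointer, step in bytes.
    cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
    const dim3 block(32, 8);
    const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
    passThrough<<<grid, block, 0, stream>>>(tex, dst);
}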
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
{
namespace gfft
{
int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
}
}}}
@@ -120,31 +120,15 @@ namespace
cudaStream_t stream_ = StreamAccessor::getStream(stream);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
//create texture object for findCorners_gpu and sortCorners_gpu
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModePoint;
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t eigTex_;
PtrStepSzf eig = eig_;
cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);
int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
if (total == 0)
{
_corners.release();
cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
return;
}
sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);
cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
if (minDistance_ < 1)
{

@@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Demosaicing
struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
struct Demosaicing : testing::TestWithParam<testing::tuple<cv::cuda::DeviceInfo, bool>>
{
cv::cuda::DeviceInfo devInfo;
bool useRoi;
virtual void SetUp()
{
devInfo = GetParam();
devInfo = GET_PARAM(0);
useRoi = GET_PARAM(1);
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
mosaic(img, src, cv::Point(1, 1));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
mosaic(img, src, cv::Point(0, 1));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
mosaic(img, src, cv::Point(0, 0));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
mosaic(img, src, cv::Point(1, 0));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES);
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// swapChannels

@@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughLines Probabilistic
PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
bool useRoi;
Size size;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
static void generateLines(cv::Mat& img)
{
img.setTo(cv::Scalar::all(0));
@@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
{
const cv::cuda::DeviceInfo devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(1);
const bool useRoi = GET_PARAM(2);
const float rho = 1.0f;
const float theta = (float) (1.0 * CV_PI / 180.0);
const int minLineLength = 15;
@@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
}
void HoughLinesProbabilisticThread(const Ptr<HoughSegmentDetector> detector, const GpuMat& imgIn, const std::vector<GpuMat>& linesOut, Stream& stream) {
for (auto& lines : linesOut)
detector->detect(imgIn, lines, stream);
stream.waitForCompletion();
}
CUDA_TEST_P(HoughLinesProbabilistic, Async)
{
constexpr int nThreads = 5;
constexpr int nIters = 5;
vector<Stream> streams(nThreads); // async test only
vector<GpuMat> imgsIn;
vector<Ptr<HoughSegmentDetector>> detectors;
vector<vector<GpuMat>> linesOut(nThreads);
const float rho = 1.0f;
const float theta = (float)(1.0 * CV_PI / 180.0);
const int minLineLength = 15;
const int maxLineGap = 8;
cv::Mat src(size, CV_8UC1);
generateLines(src);
for (int i = 0; i < nThreads; i++) {
imgsIn.push_back(loadMat(src, useRoi));
detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap));
linesOut.push_back(vector<GpuMat>(nIters));
}
vector<std::thread> thread(nThreads);
for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i)));
for (int i = 0; i < nThreads; i++) thread.at(i).join();
for (int i = 0; i < nThreads; i++) {
std::vector<cv::Vec4i> linesSegment;
std::vector<cv::Vec2f> lines;
for (const auto& line : linesOut.at(i)) {
line.download(linesSegment);
cv::Mat dst(size, CV_8UC1);
drawLines(dst, linesSegment);
ASSERT_MAT_NEAR(src, dst, 0.0);
}
}
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughCircles

@@ -49,4 +49,6 @@
#include "cvconfig.h"
#include <thread>
#endif

@@ -119,9 +119,9 @@ typedef bool NcvBool;
typedef long long Ncv64s;
#if defined(__APPLE__) && !defined(__CUDACC__)
typedef uint64_t Ncv64u;
typedef uint64 Ncv64u;
#else
typedef unsigned long long Ncv64u;
typedef uint64 Ncv64u;
#endif
typedef int Ncv32s;

@@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
@@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
@@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,

@@ -65,9 +65,12 @@
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
typedef NCVVectorAlloc<Ncv32f> FloatVector;
typedef cv::cudev::TexturePtr<float> Ptr2D;
typedef cv::cudev::Texture<float> Texture;
/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
@@ -84,39 +87,6 @@ inline int iDivUp(int a, int b)
return (a + b - 1)/b;
}
/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////
texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;
texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;
texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;
texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;
texture<float, 2, cudaReadModeElementType> tex_Ixy;
texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////
@@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p)
{
//position within shared memory array
const int ijs = js * PSOR_PITCH + is;
@@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
j = max(j, -j-1);
j = min(j, h-j+h-1);
const int pos = j * p + i;
switch(tex_id){
case 0:
smem[ijs] = tex1Dfetch(tex_u, pos);
break;
case 1:
smem[ijs] = tex1Dfetch(tex_v, pos);
break;
case 2:
smem[ijs] = tex1Dfetch(tex_du, pos);
break;
case 3:
smem[ijs] = tex1Dfetch(tex_dv, pos);
break;
}
smem[ijs] = texSrc(pos);
}
///////////////////////////////////////////////////////////////////////////////
@@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p)
{
const int i = threadIdx.x + 2;
const int j = threadIdx.y + 2;
load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel
__syncthreads();
if(threadIdx.y < 2)
{
//load bottom shadow elements
load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p);
if(threadIdx.x < 2)
{
//load bottom right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load bottom left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
{
//load upper shadow elements
load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p);
if(threadIdx.x < 2)
{
//load upper right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load upper left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else
@@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
if(threadIdx.x < 2)
{
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
__syncthreads();
@@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
float *denominator_u, float *denominator_v,
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy,
Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v,
int w, int h, int s, float alpha, float gamma)
{
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
@@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f;
//load u and v to smem
load_array<0>(u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s);
load_array(texU, u, ig, jg, w, h, s);
load_array(texV, v, ig, jg, w, h, s);
load_array(texDu, du, ig, jg, w, h, s);
load_array(texDv, dv, ig, jg, w, h, s);
//warped position
float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h;
x /= (float)w;
y /= (float)h;
//compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
const float Ix = tex2D(tex_Ix, wx, wy);
const float Ixz = Ix - tex2D(tex_Ix0, x, y);
const float Ixy = tex2D(tex_Ixy, wx, wy);
const float Ixx = tex2D(tex_Ixx, wx, wy);
const float Iy = tex2D(tex_Iy, wx, wy);
const float Iyz = Iy - tex2D(tex_Iy0, x, y);
const float Iyy = tex2D(tex_Iyy, wx, wy);
const float Iz = texI1(wy, wx) - texI0(y,x);
const float Ix = texIx(wy, wx);
const float Ixz = Ix - texIx0(y, x);
const float Ixy = texIxy(wy, wx);
const float Ixx = texIxx(wy, wx);
const float Iy = texIy(wy, wx);
const float Iyz = Iy - texIy0(y, x);
const float Iyy = texIyy(wy, wx);
//compute data term
float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
@@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
///\param h
///\param s
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
int w, int h, int s)
__global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s)
{
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
@@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
}
if(inside)
{
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
sx[ijs] = texDiffX(ijg);
sy[ijs] = texDiffY(ijg);
}
else
{
@@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
if(j == PSOR_TILE_HEIGHT-1)
{
if(jg < h-1 && inside)
{
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
sy[up] = texDiffY(ijg + s);
else
{
sy[up] = 0.0f;
}
}
int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1)
{
if(ig < w-1 && inside)
{
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
sx[right] = texDiffX(ijg + 1);
else
{
sx[right] = 0.0f;
}
}
__syncthreads();
float diffusivity_sum;
@@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////
template<int isBlack> __global__ void sor_pass(float *new_du,
float *new_dv,
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
template<int isBlack> __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u,
const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -560,14 +493,14 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
//load smooth term
float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos);
s_left = texDiffX(pos);
s_down = texDiffY(pos);
if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
s_right = texDiffX(pos_r);
else
s_right = 0.0f; //Neumann BC
if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
s_up = texDiffY(pos_u);
else
s_up = 0.0f; //Neumann BC
@@ -577,30 +510,29 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l);
u_right = tex1Dfetch(tex_u, pos_r);
u_down = tex1Dfetch(tex_u, pos_d);
u_up = tex1Dfetch(tex_u, pos_u);
u = tex1Dfetch(tex_u, pos);
v_left = tex1Dfetch(tex_v, pos_l);
v_right = tex1Dfetch(tex_v, pos_r);
v_down = tex1Dfetch(tex_v, pos_d);
v = tex1Dfetch(tex_v, pos);
v_up = tex1Dfetch(tex_v, pos_u);
du = tex1Dfetch(tex_du, pos);
du_left = tex1Dfetch(tex_du, pos_l);
du_right = tex1Dfetch(tex_du, pos_r);
du_down = tex1Dfetch(tex_du, pos_d);
du_up = tex1Dfetch(tex_du, pos_u);
dv = tex1Dfetch(tex_dv, pos);
dv_left = tex1Dfetch(tex_dv, pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u);
u_left = texU(pos_l);
u_right = texU(pos_r);
u_down = texU(pos_d);
u_up = texU(pos_u);
u = texU(pos);
v_left = texV(pos_l);
v_right = texV(pos_r);
v_down = texV(pos_d);
v = texV(pos);
v_up = texV(pos_u);
du = texDu(pos);
du_left = texDu(pos_l);
du_right = texDu(pos_r);
du_down = texDu(pos_d);
du_up = texDu(pos_u);
dv = texDv(pos);
dv_left = texDv(pos_l);
dv_right = texDv(pos_r);
dv_down = texDv(pos_d);
dv_up = texDv(pos_u);
float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack)
@@ -624,52 +556,6 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
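All of the state that initTexture1D/initTexture2D used to poke into global texture references now travels with each texture object, which is why InitTextures() disappears from this file. A minimal sketch of the equivalent 2D construction, assuming the cv::cudev::Texture constructor used later in this function (rows, cols, ptr, pitch, normalizedCoords, filterMode, addressMode):

// Equivalent of initTexture2D(tex_I0) followed by cudaBindTexture2D(..., tex_I0, ...):
cv::cudev::Texture<float> texI0(kLevelHeight, kLevelWidth, I0->ptr(),
                                kLevelStride * sizeof(float),
                                true,                   // normalized coordinates
                                cudaFilterModeLinear,   // bilinear filtering
                                cudaAddressModeMirror); // mirrored borders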
namespace
{
struct ImagePyramid
@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
InitTextures();
}
//prepare image pyramid
@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution
size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
FloatVector* ptrU = &u;
@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
//texture format descriptor
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
++img0Iter;
++img1Iter;
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes);
Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes);
// flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes);
Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes);
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes);
Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes);
Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes);
Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes);
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
(diffusivity_x.ptr(),
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy,
diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(),
num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(),
num_v.ptr(),num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop

@ -72,6 +72,7 @@
#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"
@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
/** \internal
* Haar classifiers flattened trees container.
* Two parts: first contains root nodes, second - nodes that are referred by root nodes.
* Drawback: breaks tree locality (might cause more cache misses)
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
return d_Stages[iStage];
@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
__device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
HaarClassifierNode128 tmpNode;
if (tbCacheTextureCascade)
    tmpNode._ui4 = texHaarClassifierNodes(iNode); // was: tex1Dfetch(texHaarClassifierNodes, iNode)
else
{
tmpNode = d_ClassifierNodes[iNode];
}
return tmpNode;
}
template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
Ncv32f *weight,
Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
__device__ void getFeature(cv::cudev::TexturePtr<uint2> texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight)
{
HaarFeature64 feature;
if (tbCacheTextureCascade)
    feature._ui2 = texHaarFeatures(iFeature); // was: tex1Dfetch(texHaarFeatures, iFeature)
else
{
feature = d_Features[iFeature];
}
feature.getRect(rectX, rectY, rectWidth, rectHeight);
*weight = feature.getWeight();
}
template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
__device__ Ncv32u getElemIImg(cv::cudev::TexturePtr<Ncv32u> texImg, Ncv32u x, Ncv32u *d_IImg)
{
if (tbCacheTextureIImg)
    return texImg(x); // was: tex1Dfetch(texIImage, x)
else
{
return d_IImg[x];
}
}
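Because tbCacheTextureIImg is a template parameter, each instantiation of getElemIImg compiles down to a single load path with no runtime branch. A hypothetical standalone illustration of the same pattern (names invented for the sketch):

// Hypothetical sketch: the branch below is folded away per instantiation.
template <bool useTex>
__device__ float loadElem(cv::cudev::TexturePtr<float> tex, const float* g, int i)
{
    if (useTex)
        return tex(i); // texture-cache path, kept only when useTex == true
    else
        return g[i];   // plain global-memory path otherwise
}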
@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
}
template <NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
template <NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes,
Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask,
Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u y_offs;
Ncv32u x_offs;
@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
}
template <NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
template <NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
cv::cudev::TexturePtr<Ncv32u> texImg;
cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, dim3 _gridConf,
dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes,
HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride),
d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
{}
template<class TList>
@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor
Loki::TL::TypeAt<TList, 2>::Result::value,
Loki::TL::TypeAt<TList, 3>::Result::value,
Loki::TL::TypeAt<TList, 4>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
<<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
}
};
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
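KernelCaller is what turns the five runtime NcvBool flags into the five compile-time bools of applyHaarClassifierAnchorParallel: it branches on each flag in turn, so all 2^5 = 32 kernel instantiations exist and the matching one is launched. A hand-rolled sketch of the same dispatch for a single flag (a hypothetical demo, far simpler than the Loki machinery):

// Hypothetical sketch: one runtime bool selecting a template instantiation.
template <bool kFast>
__global__ void demoKernel(float* out) { out[0] = kFast ? 1.0f : 2.0f; }

void launchDemo(bool fast, float* d_out)
{
    if (fast) demoKernel<true><<<1, 1>>>(d_out);  // one instantiation per flag value;
    else      demoKernel<false><<<1, 1>>>(d_out); // KernelCaller automates this per flag
}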
@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
cv::cudev::TexturePtr<Ncv32u> texImg;
cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_,
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features,
HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi,
Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_),
texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
{}
template<class TList>
@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor
Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
<<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
}
};
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights,
Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features,
d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
NCV_SKIP_COND_BEGIN
cv::cudev::Texture<Ncv32u> texImg;
if (bTexCacheIImg)
{
    texImg = cv::cudev::Texture<Ncv32u>((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr());
    // (old path: cudaCreateChannelDesc + cudaBindTexture of the integral to texIImage, removed)
}
cv::cudev::Texture<uint2> texHaarFeatures;
cv::cudev::Texture<uint4> texHaarClassifierNodes;
if (bTexCacheCascade) {
    texHaarFeatures = cv::cudev::Texture<uint2>(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast<uint2*>(d_HaarFeatures.ptr()));
    texHaarClassifierNodes = cv::cudev::Texture<uint4>(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast<uint4*>(d_HaarNodes.ptr()));
    // (old path: cudaCreateChannelDesc + cudaBindTexture of the cascade buffers, removed)
}
Ncv32u stageStartAnchorParallel = 0;
@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
anchorsRoi.height);
dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
true, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid1,
block1,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
0,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
applyHaarClassifierAnchorParallelDynTemplate(true, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream,
texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
}
dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
false, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid2,
block2,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
applyHaarClassifierAnchorParallelDynTemplate(false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction,
grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(),
d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
}
dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
applyHaarClassifierClassifierParallelDynTemplate(
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
bDoAtomicCompaction, //tbDoAtomicCompaction
grid3,
block3,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
stageMiddleSwitch,
stageEndClassifierParallel,
scaleAreaPixels);
applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections,
d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)

@ -48,12 +48,7 @@
#include "opencv2/cudev.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
texture<uint2, 1, cudaReadModeElementType> tex64u;
#include <opencv2/cudev/ptr2d/texture.hpp>
//==============================================================================
//
@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void)
}
cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
{
cudaStream_t tmp = nppStream;
@ -117,25 +111,25 @@ private:
template<class T>
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
inline __device__ T readElem(cv::cudev::TexturePtr<Ncv8u> tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
template<>
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv8u readElem<Ncv8u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs);
}
template<>
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32u readElem<Ncv32u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
template<>
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32f readElem<Ncv32f>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u
* \return None
*/
template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
T_out *d_II, Ncv32u IIstride)
__global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride)
{
//advance pointers to the current line
if (sizeof(T_in) != 1)
@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
if (curElemOffs < srcWidth)
{
//load elements
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
}
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@ -224,25 +217,9 @@ template <bool tbDoSqr, class T_in, class T_out>
NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{
cudaChannelFormatDesc cfdTex;
size_t alignmentOffset = 0;
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
{
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
}
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
cv::cudev::Texture<Ncv8u> tex8u(static_cast<size_t>(roi.height * srcStride), (Ncv8u*)d_src);
scanRows <T_in, T_out, tbDoSqr> <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
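Note that the texture object is now created unconditionally over the raw source bytes and the alignment-offset dance is gone: the new wrapper assumes the buffer starts at a suitably aligned address (true for cudaMalloc allocations), so texOffs is always passed as 0, and the 32-bit and float instantiations of readElem simply ignore the texture and read d_src directly. A minimal usage sketch of this linear-memory constructor (size in bytes, device pointer), assuming d_src points at roi.height * srcStride valid bytes:

// Wrap an existing linear device buffer in a 1D texture object (sketch):
cv::cudev::Texture<Ncv8u> tex8u(roi.height * srcStride, (Ncv8u*)d_src);
// Texture converts implicitly to cv::cudev::TexturePtr<Ncv8u> at launch:
// scanRows<T_in, T_out, tbDoSqr><<<grid, block, 0, stream>>>(tex8u, ...);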
@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template<class T, NcvBool tbCacheTexture>
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{
    return d_src[x];
}

template <class T>
__global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
{
    int curX = blockIdx.x * blockDim.x + threadIdx.x;
    int curY = blockIdx.y * blockDim.y + threadIdx.y;
    if (curX >= dstRoi.width || curY >= dstRoi.height) return;
    d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale];
}
template <class T, NcvBool tbCacheTexture>
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
template <class T>
__global__ void decimate_C1R(cv::cudev::TexturePtr<T> texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
{
int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale); // was: getElem_Decimate<T, tbCacheTexture>(..., d_src)
}
template <class T>
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep,
@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
(dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
if (!readThruTexture) {
    decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
}
else {
    cv::cudev::Texture<T> texSrc(srcRoi.height * srcStep * sizeof(T), d_src);
    decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSrc, srcStep, d_dst, dstStep, dstRoi, scale);
    // (old path: cudaCreateChannelDesc + cudaBindTexture of d_src to tex32u/tex64u, removed)
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
implementNppDecimate(32, u)
implementNppDecimate(32, s)
implementNppDecimate(32, f)
implementNppDecimate(64, u)
implementNppDecimate(64, s)
implementNppDecimate(64, f)
implementNppDecimateHost(32, u)
implementNppDecimateHost(32, s)
implementNppDecimateHost(32, f)
@ -776,43 +688,29 @@ implementNppDecimateHost(64, f)
const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
template <NcvBool tbCacheTexture>
__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum)
{
if (tbCacheTexture)
    return tex(x); // was: tex1Dfetch(tex32u, x)
else
{
return d_sum[x];
}
}
template <NcvBool tbCacheTexture>
__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum)
{
if (tbCacheTexture)
    return tex(x); // was: a uint2 fetch from tex64u, reassembled into Ncv64u
else
{
return d_sqsum[x];
}
}
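Texture references could not return 64-bit integers, which is why the removed path fetched the squared sums as uint2 and spliced the halves together; cv::cudev::Texture<Ncv64u> performs that repacking internally. For reference, the removed reassembly was equivalent to this helper:

// Equivalent of the removed uint2 -> Ncv64u reassembly (sketch):
__device__ Ncv64u unpack64(uint2 tmp)
{
    Ncv64u res = (Ncv64u)tmp.y; // high 32 bits
    res <<= 32;
    res |= tmp.x;               // low 32 bits
    return res;
}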
template <NcvBool tbCacheTexture>
__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv32f *d_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
__global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr<Ncv32u> texSum, cv::cudev::TexturePtr<Ncv64u> texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
{
Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
if (x_offs >= roi.width)
@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
//OPT: try swapping order (could change cache hit/miss ratio)
Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_tl = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_tl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
Ncv32f mean = sum_val * invRectArea;
@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
dim3 block(NUM_RECTSTDDEV_THREADS);
cv::cudev::Texture<Ncv32u> texSum((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum);
cv::cudev::Texture<Ncv64u> texSumSq((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum);
// (old path: cudaCreateChannelDesc + cudaBindTexture of d_sum/d_sqsum to tex32u/tex64u, removed)
if (!readThruTexture)
    rectStdDev_32f_C1R<false><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
else
    rectStdDev_32f_C1R<true><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
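For reference, the kernel computes the window statistics with the classic four-corner identity on the integral images. With S the sum image, Q the squared-sum image and A = rect.width * rect.height (so invRectArea = 1/A), the elided tail of the kernel evaluates, in effect:

\Sigma = S_{br} + S_{tl} - S_{tr} - S_{bl}, \qquad
\mu = \frac{\Sigma}{A}, \qquad
\sigma = \sqrt{\frac{Q_{br} + Q_{tl} - Q_{tr} - Q_{bl}}{A} - \mu^2}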
@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
//
//==============================================================================
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr<Ncv32f> tex, const int rowOffset, int i, int w)
{
if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i);
return tex(rowOffset + i);
}
__forceinline__ __device__ float getValueMirrorColumn(const int offset,
const int rowStep,
int j,
int h)
__forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr<Ncv32f> tex, const int offset, const int rowStep, int j, int h)
{
if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep);
return tex(offset + j * rowStep);
}
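Both helpers handle out-of-range taps by mirroring; evaluating the arithmetic shown above for w = 5 gives i = -1 -> 2 and i = 5 -> 4. A host-side transcription of the same mapping for quick checking (sketch):

// Host-side transcription of the mirror index mapping used above:
int mirrorIndex(int i, int w)
{
    if (i < 0)  i = 1 - i;          // e.g. w = 5: i = -1 -> 2
    if (i >= w) i = w + w - i - 1;  // e.g. w = 5: i =  5 -> 4
    return i;
}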
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
__global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel1, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
{
// position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m);
sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width)
* texKernel1(m);
}
pDst[iy * dstStep + ix] = sum * multiplier;
}
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
__global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m);
sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height)
* texKernel(m);
}
pDst[ix + iy * dstStep] = sum * multiplier;
}
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
}
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
return (num + denom - 1)/denom;
}
texture<float, 2, cudaReadModeElementType> tex_src1;
texture<float, 2, cudaReadModeElementType> tex_src0;
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
__global__ void BlendFramesKernel(cv::cudev::TexturePtr<Ncv32f> texSrc0, cv::cudev::TexturePtr<Ncv32f> texSrc1,
const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s, float theta, float *out)
{
const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y;
@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v, // forward f
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
if (b0 && b1) // pixel is visible on both frames
out[pos] = texSrc0(y - _v * theta, x - _u * theta) * (1.0f - theta) + texSrc1(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta;
else if (b0) // visible on the first frame only
out[pos] = texSrc0(y - _v * theta, x - _u * theta);
else // visible on the second frame only
out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta));
}
NCVStatus BlendFrames(const Ncv32f *src0,
const Ncv32f *src1,
NCVStatus BlendFrames(Ncv32f *src0,
Ncv32f *src1,
const Ncv32f *ufi,
const Ncv32f *vfi,
const Ncv32f *ubi,
@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0,
Ncv32f theta,
Ncv32f *out)
{
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
cv::cudev::Texture<Ncv32f> texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear);
cv::cudev::Texture<Ncv32f> texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear);
dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
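For reviewers new to the cv::cudev::Texture wrapper, here is a minimal sketch of the migration pattern applied throughout this PR (the names sampleKernel and launch are illustrative only, not part of the change). The wrapper is an RAII replacement for the old bind/unbind pairs, it converts implicitly to the device-side TexturePtr, and its operator() takes (row, column) where tex2D took (x, y) — which is why every call site above swaps its argument order:

#include <opencv2/cudev/ptr2d/texture.hpp>

// Illustrative kernel: TexturePtr is the device-side view of a Texture.
__global__ void sampleKernel(cv::cudev::TexturePtr<float> tex, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex(y, x); // (row, column) -- tex2D used (x, y)
}

void launch(float* src, float* out, int w, int h, size_t pitchBytes)
{
    // One RAII object instead of cudaBindTexture2D/cudaUnbindTexture;
    // constructor arguments follow the usage in the hunks above.
    cv::cudev::Texture<float> tex(h, w, src, pitchBytes, false, cudaFilterModeLinear);
    dim3 block(32, 8);
    dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);
    sampleKernel<<<grid, block>>>(tex, out, w, h); // implicit Texture -> TexturePtr
    cudaDeviceSynchronize();
}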
@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
//
//==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__
__device__ float processLine(int spos,
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
__device__ float processLine(cv::cudev::TexturePtr<Ncv32f> tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax)
{
// first element
float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
float sum = tex(spos) * (1.0f - xmin + fxmin);
spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix)
{
sum += tex1Dfetch(texSrc, spos);
sum += tex(spos);
spos++;
wsum += 1.0f;
}
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
sum += tex(spos) * (cxmax - xmax);
wsum += cxmax - xmax;
return sum / wsum;
}
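In words: processLine forms the weighted average of one source scanline over the fractional interval [xmin, xmax], with partial weights at the two boundary texels (fxmin = floor(xmin), cxmax = ceil(xmax)):

$$\text{processLine}=\frac{(1-x_{\min}+\lfloor x_{\min}\rfloor)\,s_{i_{\min}}+\sum_{i=i_{\min}+1}^{i_{\max}-1}s_i+(\lceil x_{\max}\rceil-x_{\max})\,s_{i_{\max}}}{(1-x_{\min}+\lfloor x_{\min}\rfloor)+(i_{\max}-i_{\min}-1)+(\lceil x_{\max}\rceil-x_{\max})}$$

where $s_i$ is the texel fetched through tex at linear position spos + (i - ixmin).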
__global__ void resizeSuperSample_32f(NcvSize32u srcSize,
Ncv32u srcStep,
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
__global__ void resizeSuperSample_32f(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
{
// position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep;
for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
{
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd);
pos += srcStep;
wsum += 1.0f;
}
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd;
sum /= wsum;
@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_)
}
__global__ void resizeBicubic(NcvSize32u srcSize,
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
__global__ void resizeBicubic(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
{
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist);
wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
sum += wx * texSrc(cy * dy, cx * dx);
wsum += wx;
}
}
@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
}
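Note the sampling convention the bicubic path relies on: the replacement texture below is created with normalizedCoords = true and cudaAddressModeMirror, so a fetch at (u, v) in [0,1]² reads the pixel at (u·W, v·H) with out-of-range coordinates reflected back into the image; dx and dy are presumably the reciprocal source dimensions (their definitions fall outside this hunk):

$$\texttt{texSrc}(v,u)=I\big(\operatorname{mirror}(v\cdot H),\ \operatorname{mirror}(u\cdot W)\big),\qquad (u,v)\in[0,1]^2$$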
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,
@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
if (interpolation == nppStSupersample)
{
// bind texture
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,
(dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, (dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else if (interpolation == nppStBicubic)
{
texSrc2D.addressMode[0] = cudaAddressModeMirror;
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
cv::cudev::Texture<float> texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x,
(dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else
{

@ -46,29 +46,27 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
__device__ int cmpBlocks(cv::cudev::TexturePtr<uchar> texCurr, cv::cudev::TexturePtr<uchar> texPrev, int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
s += ::abs(texPrev(Y1 + y, X1 + x) - texCurr(Y2 + y, X2 + x));
}
return s;
}
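cmpBlocks is a plain sum-of-absolute-differences block comparison between the previous and current frames; with (b_x, b_y) = blockSize:

$$\mathrm{SAD}=\sum_{y=0}^{b_y-1}\sum_{x=0}^{b_x-1}\big|\,I_{\mathrm{prev}}(Y_1+y,\,X_1+x)-I_{\mathrm{curr}}(Y_2+y,\,X_2+x)\,\big|$$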
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
__global__ void calcOptFlowBM(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
@ -90,7 +88,7 @@ namespace optflowbm
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
@ -111,7 +109,7 @@ namespace optflowbm
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
@ -151,16 +149,12 @@ namespace optflowbm
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
cv::cudev::Texture<uchar> texPrev(prev);
cv::cudev::Texture<uchar> texCurr(curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

@ -100,7 +100,8 @@ bool TestHypothesesGrow::process()
NCV_SKIP_COND_BEGIN
ncvAssertReturn(this->src.fill(h_vecSrc), false);
memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
*h_vecDst.ptr() = {};
NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -825,64 +826,57 @@ namespace cv { namespace cuda { namespace device
//-------------------------------------------------------------------
// Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
__global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar, float> src, float sx, float sy, PtrStepSz<uchar> dst)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
dst.ptr(y)[x] = src(y * sy, x * sx) * 255;
}
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
__global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar4, float4> src, float sx, float sy, PtrStepSz<uchar4> dst)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
float4 val = src(y * sy, x * sx);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
}
}
template<class T, class TEX>
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
static void resize_for_hog_8UC1(const PtrStepSzb& src, PtrStepSzb dst)
{
tex.filterMode = cudaFilterModeLinear;
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
cv::cudev::Texture<uchar,float> tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar>)dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
static void resize_for_hog_8UC4(const PtrStepSzb& src, PtrStepSzb dst)
{
cv::cudev::Texture<uchar4, float4> tex(src.rows, src.cols, reinterpret_cast<uchar4*>(src.data), src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar4>)dst);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
}
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC1(src, dst); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC4(src, dst); }
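A note on the * 255 in both kernels above: the textures are created with cudaReadModeNormalizedFloat, so an 8-bit texel p is returned (after bilinear filtering) as p/255 in [0, 1], and the kernel rescales before the 8-bit store:

$$\mathrm{dst}(y,x)=255\cdot\mathrm{tex}(y\cdot s_y,\ x\cdot s_x),\qquad s_x=\tfrac{W_{\mathrm{src}}}{W_{\mathrm{dst}}},\ \ s_y=\tfrac{H_{\mathrm{src}}}{H_{\mathrm{dst}}}$$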
} // namespace hog
}}} // namespace cv { namespace cuda { namespace cudev

@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
*/
//============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string> >
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string, bool>>
{
cv::cuda::DeviceInfo devInfo;
cv::Mat img;
@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std
devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
const bool grayScale = GET_PARAM(2);
if(grayScale)
img = readImage(GET_PARAM(1), IMREAD_GRAYSCALE);
else {
Mat imgBgr = readImage(GET_PARAM(1));
cv::cvtColor(imgBgr, img, COLOR_BGR2BGRA);
}
ASSERT_FALSE(img.empty());
}
};
@ -263,10 +269,11 @@ CUDA_TEST_P(CalTech, HOG)
#endif
}
#define GREYSCALE true, false
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE)));
//------------------------variable GPU HOG Tests------------------------//

@ -50,8 +50,7 @@
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <iostream>
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@ -64,224 +63,6 @@ namespace pyrlk
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
CV_UNUSED(I);
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
CV_UNUSED(J);
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val)
{
dst += val;
@ -364,8 +145,8 @@ namespace pyrlk
}
};
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T, class Ptr2D>
__global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
@ -413,15 +194,14 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y);
I_patch[i][j] = texI(y, x);
// Scharr Deriv
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1));
work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
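For readers checking the coordinate swap above: both the removed and the added lines implement the (unnormalized) 3×3 Scharr operator, with texI(y - 1, x + 1) being the tap at row offset −1, column offset +1:

$$G_x=\begin{bmatrix}-3&0&3\\-10&0&10\\-3&0&3\end{bmatrix},\qquad G_y=G_x^{\top}=\begin{bmatrix}-3&-10&-3\\0&0&0\\3&10&3\end{bmatrix}$$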
@ -490,7 +270,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
@ -533,7 +314,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = J_val - I_val;
@ -749,6 +531,27 @@ namespace pyrlk
}
} // __global__ void sparseKernel_
// Specialization for non-float data: for integer channel types, cudaFilterModeLinear is only compatible with cudaReadModeNormalizedFloat.
template<int cn, class T> class TextureLinear : public cv::cudev::Texture<typename TypeVec<T, cn>::vec_type, typename TypeVec<float, cn>::vec_type> {
public:
typedef typename TypeVec<T, cn>::vec_type elem_type;
typedef typename TypeVec<float, cn>::vec_type ret_type;
__host__ TextureLinear(PtrStepSz<elem_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture<elem_type, ret_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat)
{
}
};
// Specialization for float data: float channels do not support cudaReadModeNormalizedFloat, so cudaFilterModeLinear is paired with cudaReadModeElementType.
template<int cn> class TextureLinear<cn, float> : public cv::cudev::Texture<typename TypeVec<float, cn>::vec_type, typename TypeVec<float, cn>::vec_type>
{
public:
typedef typename TypeVec<float, cn>::vec_type float_type;
__host__ TextureLinear(PtrStepSz<float_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture <float_type, float_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType)
{
}
};
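A sketch of how the two TextureLinear variants above are meant to be instantiated (the function and argument names here are illustrative only): integer channel types get cudaReadModeNormalizedFloat fetches returning floats in [0, 1], while float channels keep cudaReadModeElementType; both can then use hardware cudaFilterModeLinear.

void makeTextures(PtrStepSz<uchar> I8u, PtrStepSzf I32f)
{
    TextureLinear<1, uchar> texU8(I8u);   // Texture<uchar, float>: fetches return [0,1] floats
    TextureLinear<1, float> texF32(I32f); // Texture<float, float>: fetches return the raw floats
}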
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{
@ -756,16 +559,16 @@ namespace pyrlk
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type dType;
typedef typename TypeVec<float, cn>::vec_type rType;
TextureLinear<cn,T> texI(I);
TextureLinear<cn,T> texJ(J);
dim3 grid(ptcount);
CV_UNUSED(I);
CV_UNUSED(J);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
sparseKernel<cn, PATCH_X, PATCH_Y, true, T, cv::cudev::TexturePtr<dType,rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
sparseKernel<cn, PATCH_X, PATCH_Y, false, T, cv::cudev::TexturePtr<dType, rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
@ -903,8 +706,8 @@ namespace pyrlk
};
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
template <bool calcErr, class Ptr2D>
__global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
@ -925,15 +728,15 @@ namespace pyrlk
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_If, x, y);
I_patch[i * patchWidth + j] = texI(y, x);
// Scharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1));
dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) -
(3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1));
dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1, x) + 3 * texI(y + 1, x + 1) -
(3 * texI(y - 1, x - 1) + 10 * texI(y - 1, x) + 3 * texI(y - 1, x + 1));
}
}
@ -1004,7 +807,7 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
int diff = (J - I) * 32;
@ -1040,7 +843,8 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
errval += ::abs(J - I);
}
@ -1109,9 +913,6 @@ namespace pyrlk
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
};
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
@ -1119,9 +920,8 @@ namespace pyrlk
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, T>::bindTexture_(I);
Tex_J<1, T>::bindTexture_(J);
TextureLinear<1, T> texI(I);
TextureLinear<1, T> texJ(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
@ -1129,12 +929,12 @@ namespace pyrlk
if (err.data)
{
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols);
denseKernel<true, cv::cudev::TexturePtr<T,float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
else
{
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
denseKernel<false, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@ -102,63 +103,8 @@ namespace tvl1flow
}
}
struct SrcTex
{
virtual ~SrcTex() {}
__device__ __forceinline__ virtual float I1(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
};
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1y, x, y);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
: tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1y, x, y);
}
cudaTextureObject_t tex_obj_I1;
cudaTextureObject_t tex_obj_I1x;
cudaTextureObject_t tex_obj_I1y;
};
template <
typename T,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void warpBackwardKernel(
const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
const PtrStepSzf I0, const cv::cudev::TexturePtr<float> I1, const cv::cudev::TexturePtr<float> I1x, const cv::cudev::TexturePtr<float> I1y, const PtrStepf u1, const PtrStepf u2,
PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -189,11 +135,9 @@ namespace tvl1flow
for (int cx = xmin; cx <= xmax; ++cx)
{
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * src.I1(cx, cy);
sumx += w * src.I1x(cx, cy);
sumy += w * src.I1y(cx, cy);
sum += w * I1(cy, cx);
sumx += w * I1x(cy, cx);
sumy += w * I1y(cy, cx);
wsum += w;
}
}
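The loop above is a normalized bicubic gather at the forward-warped position; with k = bicubicCoeff and (w_x, w_y) = (x + u_1, y + u_2):

$$I_1^w(x,y)=\frac{\sum_{c_y}\sum_{c_x}k(w_x-c_x)\,k(w_y-c_y)\,I_1(c_y,c_x)}{\sum_{c_y}\sum_{c_x}k(w_x-c_x)\,k(w_y-c_y)}$$

and likewise for the warped derivatives I1x, I1y accumulated in sumx/sumy.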
@ -224,49 +168,14 @@ namespace tvl1flow
PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
cudaStream_t stream)
{
cv::cudev::Texture<float> texI1(I1);
cv::cudev::Texture<float> texI1x(I1x);
cv::cudev::Texture<float> texI1y(I1y);
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
else
cudaSafeCall(cudaStreamSynchronize(stream));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
}
else
{
bindTexture(&tex_I1, I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, texI1, texI1x, texI1y, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
}

@ -43,8 +43,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <limits.h>
namespace cv { namespace cuda { namespace device
{
namespace stereobm
@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ __forceinline__ float sobel(int x, int y)
__device__ __forceinline__ float sobel(cv::cudev::TexturePtr<uchar, float> texSrc, int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) +
texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) +
texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1);
return fabs(conv);
}
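sobel evaluates the magnitude of the horizontal Sobel response through the normalized-float texture:

$$\mathrm{sobel}(x,y)=\big|\,(G_x*I)(x,y)\,\big|,\qquad G_x=\begin{bmatrix}-1&0&1\\-2&0&2\\-1&0&1\end{bmatrix}$$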
@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device
#define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
__global__ void textureness_kernel(cv::cudev::TexturePtr<uchar,float> texSrc, PtrStepSzb disp, int winsz, float threshold)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
sum += sobel(texSrc, x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
{
avgTexturenessThreshold *= winsz * winsz;
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
cv::cudev::Texture<unsigned char, float> tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat);
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
textureness_kernel<<<grid, threads, smem_size, stream>>>(tex, disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
} // namespace stereobm
}}} // namespace cv { namespace cuda { namespace cudev

@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -77,8 +78,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
@ -98,8 +99,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device
}
};
#define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStreamTex
{
static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float* borderValue, bool cc20)
{
typedef typename TypeVec<float, VecTraits< T >::cn>::vec_type work_type;
dim3 block(32, cc20 ? 8 : 4);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct RemapDispatcherNonStreamTex<Filter, BrdReplicate, T>
{
static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float*, bool)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
Filter<cudev::TexturePtr<T>> filter_src(texSrcWhole);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
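The dispatch rule in both structs above deserves a word: a plain cudev::Texture is used only when the ROI covers the whole allocation; otherwise cudev::TextureOff(srcWhole, yoff, xoff) bakes the ROI origin into every fetch. A hypothetical kernel spelling out the offset semantics assumed here (roiCopy is illustrative, not part of the change):

__global__ void roiCopy(cv::cudev::TextureOffPtr<float> src, PtrStepSzf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // fetches srcWhole at (y + yoff, x + xoff)
}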
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar> :
RemapDispatcherNonStreamTex<Filter, B, uchar> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar4> :
RemapDispatcherNonStreamTex<Filter, B, uchar4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort> :
RemapDispatcherNonStreamTex<Filter, B, ushort> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort4> :
RemapDispatcherNonStreamTex<Filter, B, ushort4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short> :
RemapDispatcherNonStreamTex<Filter, B, short> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short4> :
RemapDispatcherNonStreamTex<Filter, B, short4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float> :
RemapDispatcherNonStreamTex<Filter, B, float> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float4> :
RemapDispatcherNonStreamTex<Filter, B, float4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float4> {};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
@ -234,37 +243,23 @@ namespace cv { namespace cuda { namespace device
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
callers[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff, xmap, ymap,
static_cast<PtrStepSz<T>>(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc

@ -49,6 +49,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -105,7 +106,7 @@ namespace cv { namespace cuda { namespace device
}
}
template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
template <class Ptr2D, typename T> __global__ void resize(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
{
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
@ -130,54 +131,6 @@ namespace cv { namespace cuda { namespace device
}
}
// textures
template <typename T> struct TextureAccessor;
#define OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(type) \
texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
template <> struct TextureAccessor<type> \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff; \
int yoff; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_##type, x + xoff, y + yoff); \
} \
__host__ static void bind(const PtrStepSz<type>& mat) \
{ \
bindTexture(&tex_resize_##type, mat); \
} \
};
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_RESIZE_TEX
template <typename T>
TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
{
TextureAccessor<T>::bind(mat);
TextureAccessor<T> t;
t.xoff = xoff;
t.yoff = yoff;
return t;
}
// callers for nearest interpolation
template <typename T>
@ -194,14 +147,19 @@ namespace cv { namespace cuda { namespace device
}
template <typename T>
void call_resize_nearest_tex(const PtrStepSz<T>& /*src*/, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
void call_resize_nearest_tex(const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx);
if (xoff || yoff) {
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
resize<cudev::TextureOffPtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
resize<cudev::TexturePtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
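A minimal standalone sketch of the pattern introduced here, with hypothetical kernel and helper names (copyThroughTexture, copyViaTexture): the host-side cv::cudev::Texture owns the cudaTextureObject_t, and its implicit conversion to the lightweight TexturePtr is what crosses the kernel boundary by value.
__global__ void copyThroughTexture(cv::cudev::TexturePtr<uchar> src, cv::cuda::PtrStepSzb dst)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // tex2D fetch, clamped at the borders by default
}
void copyViaTexture(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst) // both assumed CV_8UC1, same size
{
    cv::cuda::PtrStepSz<uchar> srcView = src;
    cv::cudev::Texture<uchar> tex(srcView); // texture object created here
    const dim3 block(32, 8);
    const dim3 grid(cv::cudev::divUp(dst.cols, block.x), cv::cudev::divUp(dst.rows, block.y));
    copyThroughTexture<<<grid, block>>>(tex, dst); // Texture converts to TexturePtr
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); // texture object destroyed when tex leaves scope
}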
@ -225,27 +183,21 @@ namespace cv { namespace cuda { namespace device
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data)
{
TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);
cudev::Texture<T> texSrc(src);
LinearFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
else
{
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
LinearFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
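// Note on the branch above: when the ROI spans the whole image, the texture's
// cudaAddressModeClamp already replicates the border, so the filter can sample
// the TexturePtr directly. For a true sub-image, hardware clamping would fetch
// neighbouring pixels of the full image instead of the ROI border, hence the
// explicit BrdReplicate over the offset texture.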
@ -258,8 +210,8 @@ namespace cv { namespace cuda { namespace device
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<PtrStep<T>, BrdReplicate<T>> brdSrc(src, brd);
CubicFilter<BorderReader<PtrStep<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
cudaSafeCall( cudaGetLastError() );
@ -273,27 +225,21 @@ namespace cv { namespace cuda { namespace device
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data)
{
TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
CubicFilter< TextureAccessor<T> > filteredSrc(texSrc);
cudev::Texture<T> texSrc(src);
CubicFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
else
{
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
CubicFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
@ -318,7 +264,7 @@ namespace cv { namespace cuda { namespace device
if (fx > 1 || fy > 1)
call_resize_nearest_glob(src, dst, fy, fx, 0);
else
call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
call_resize_nearest_tex(srcWhole, yoff, xoff, dst, fy, fx);
}
}
};
@ -389,7 +335,7 @@ namespace cv { namespace cuda { namespace device
{
if (stream)
call_resize_cubic_glob(src, dst, fy, fx, stream);
else
else
call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
}
};
@ -421,16 +367,16 @@ namespace cv { namespace cuda { namespace device
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
{
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
IntegerAreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
}
else
{
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
AreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
}

@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -164,8 +165,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );
@ -186,8 +187,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );
@ -196,86 +197,48 @@ namespace cv { namespace cuda { namespace device
}
};
#define OPENCV_CUDA_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float4)
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStreamTex
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, cc20 ? 8 : 4);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (xoff || yoff) {
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>> (filter_src, dst, warpMat);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>> (filter_src, dst, warpMat);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#undef OPENCV_CUDA_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float4> {};
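Routing a further texture-capable element type through the texture path takes only one more forwarding specialization. A hypothetical sketch for uchar2, a 2-channel fetch the hardware supports but which is left unwired here, matching the commented-out instantiations below:
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar2> :
    WarpDispatcherNonStreamTex<Transform, Filter, B, uchar2> {};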
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
@ -319,8 +282,8 @@ namespace cv { namespace cuda { namespace device
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20);
funcs[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff,
static_cast<PtrStepSz<T>>(dst), borderValue, warpMat, stream, cc20);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
@ -330,32 +293,18 @@ namespace cv { namespace cuda { namespace device
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -366,32 +315,18 @@ namespace cv { namespace cuda { namespace device
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc

@ -42,6 +42,8 @@
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <thread>
#include "opencv2/ts.hpp"
#include "opencv2/ts/cuda_test.hpp"

@ -206,6 +206,60 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeSameAsHost, testing::Combine(
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
WHOLE_SUBMAT));
PARAM_TEST_CASE(ResizeTextures, cv::cuda::DeviceInfo, Interpolation)
{
cv::cuda::DeviceInfo devInfo;
Interpolation interpolation;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
interpolation = GET_PARAM(1);
cv::cuda::setDevice(devInfo.deviceID());
}
};
void ResizeThread(const Interpolation interp, const GpuMat& imgIn, const std::vector<GpuMat>& imgsOut, Stream& stream) {
for (auto& imgOut : imgsOut)
cv::cuda::resize(imgIn, imgOut, imgOut.size(), 0, 0, interp, stream);
}
CUDA_TEST_P(ResizeTextures, Accuracy)
{
constexpr int nThreads = 5;
constexpr int nIters = 5;
const Size szIn(100, 100);
const Size szOut(200, 200);
vector<Stream> streams(nThreads, cv::cuda::Stream::Null());
vector<GpuMat> imgsIn;
vector<vector<GpuMat>> imgsOut;
for (int i = 0; i < nThreads; i++) {
imgsIn.push_back(GpuMat(szIn, CV_8UC1, i));
vector<GpuMat> imgsOutPerThread;
for (int j = 0; j < nIters; j++)
imgsOutPerThread.push_back(GpuMat(szOut, CV_8UC1));
imgsOut.push_back(imgsOutPerThread);
}
vector<std::thread> thread(nThreads);
for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(ResizeThread, interpolation, std::ref(imgsIn.at(i)), std::ref(imgsOut.at(i)), std::ref(streams.at(i)));
for (int i = 0; i < nThreads; i++) thread.at(i).join();
for (int i = 0; i < nThreads; i++) {
GpuMat imgOutGs;
cv::cuda::resize(imgsIn.at(i), imgOutGs, szOut, 0, 0, interpolation, streams.at(i));
Mat imgOutGsHost; imgOutGs.download(imgOutGsHost);
for (const auto& imgOut : imgsOut.at(i)) {
Mat imgOutHost; imgOut.download(imgOutHost);
ASSERT_TRUE(cv::norm(imgOutHost, imgOutGsHost, NORM_INF) == 0);
}
}
}
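// The per-thread streams and output batches above exercise the central change of
// this PR: every resize call now creates its own cudaTextureObject_t, so
// concurrent calls from several host threads no longer race on one global
// texture reference binding.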
INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
ALL_DEVICES,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
}} // namespace
#endif // HAVE_CUDA

@ -1,147 +1,159 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma once
#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_HPP
#define OPENCV_CUDEV_PTR2D_TEXTURE_HPP
#include <cstring>
#include "../common.hpp"
#include "glob.hpp"
#include "gpumat.hpp"
#include "traits.hpp"
#if CUDART_VERSION >= 5050
namespace
{
template <typename T> struct CvCudevTextureRef
{
typedef texture<T, cudaTextureType2D, cudaReadModeElementType> TexRef;
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
static TexRef ref;
#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
#define OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
__host__ static void bind(const cv::cudev::GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
ref.normalized = normalizedCoords;
ref.filterMode = filterMode;
ref.addressMode[0] = addressMode;
ref.addressMode[1] = addressMode;
ref.addressMode[2] = addressMode;
#include <opencv2/core.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/core/cuda_types.hpp>
#include <opencv2/cudev/common.hpp>
#include <opencv2/cudev/ptr2d/traits.hpp>
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
/** \file texture.hpp
*/
CV_CUDEV_SAFE_CALL( cudaBindTexture2D(0, &ref, mat.data, &desc, mat.cols, mat.rows, mat.step) );
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
/** @brief Simple lightweight structures that encapsulate information about an image texture on the device.
* They are intended to be passed to nvcc-compiled code.
*/
template<class T, class R = T>
struct TexturePtr {
typedef R elem_type, value_type;
typedef float index_type;
__host__ TexturePtr() {};
__host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
return tex2D<R>(tex, x, y);
}
__device__ __forceinline__ R operator ()(index_type x) const {
return tex1Dfetch<R>(tex, x);
}
private:
cudaTextureObject_t tex;
};
__host__ static void unbind()
{
cudaUnbindTexture(ref);
// texture fetches are at most 32 bits wide, so 64 bit types are read as two 32 bit values
template <class R>
struct TexturePtr<uint64, R> {
typedef float index_type;
__host__ TexturePtr() {};
__host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
const uint2 retVal = tex2D<uint2>(tex, x, y);
return *(reinterpret_cast<const R*>(&retVal));
}
__device__ __forceinline__ R operator ()(index_type x) const {
const uint2 retVal = tex1Dfetch<uint2>(tex, x);
return *(reinterpret_cast<const R*>(&retVal));
}
private:
cudaTextureObject_t tex;
};
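A small sketch, with a hypothetical kernel name, of the single-argument operator: a TexturePtr bound to a linear buffer is indexed like an array and the fetch goes through tex1Dfetch.
__global__ void scaleBuffer(cv::cudev::TexturePtr<float> src, float* dst, int n, float s)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        dst[i] = s * src(i); // tex1Dfetch<float>(tex, i)
}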
template <typename T>
typename CvCudevTextureRef<T>::TexRef CvCudevTextureRef<T>::ref;
}
template<class T, class R = T>
struct TextureOffPtr {
typedef R elem_type;
typedef float index_type;
__host__ TextureOffPtr(const cudaTextureObject_t tex_, const int yoff_, const int xoff_) : tex(tex_), yoff(yoff_), xoff(xoff_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
return tex2D<R>(tex, x + xoff, y + yoff);
}
private:
cudaTextureObject_t tex;
int xoff = 0;
int yoff = 0;
};
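The offset variant folds (yoff, xoff) into every fetch, so a kernel reading a ROI inside a larger bound image keeps using ROI-relative coordinates. A hypothetical sketch:
__global__ void copyRoi(cv::cudev::TextureOffPtr<float> src, cv::cuda::PtrStepSzf dst)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // actually fetches (y + yoff, x + xoff)
}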
#endif
/** @brief non-copyable smart CUDA texture object
*
* UniqueTexture is a smart non-sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class UniqueTexture {
public:
__host__ UniqueTexture() noexcept { }
__host__ UniqueTexture(const UniqueTexture&) = delete;
__host__ UniqueTexture(UniqueTexture&& other) noexcept {
tex = other.tex;
other.tex = 0;
}
namespace cv { namespace cudev {
__host__ UniqueTexture(const int rows, const int cols, T* data, const size_t step, const bool normalizedCoords = false,
const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
const cudaTextureReadMode readMode = cudaReadModeElementType)
{
create(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode);
}
//! @addtogroup cudev
//! @{
__host__ UniqueTexture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType)
{
create(1, static_cast<int>(sizeInBytes/sizeof(T)), data, sizeInBytes, normalizedCoords, filterMode, addressMode, readMode);
}
#if CUDART_VERSION >= 5050
__host__ ~UniqueTexture() {
if (tex != cudaTextureObject_t()) {
try {
CV_CUDEV_SAFE_CALL(cudaDestroyTextureObject(tex));
}
catch (const cv::Exception& ex) {
std::ostringstream os;
os << "Exception caught during CUDA texture object destruction.\n";
os << ex.what();
os << "Exception will be ignored.\n";
CV_LOG_WARNING(0, os.str().c_str());
}
}
template <typename T> struct TexturePtr
{
typedef T value_type;
typedef float index_type;
}
cudaTextureObject_t texObj;
__host__ UniqueTexture& operator=(const UniqueTexture&) = delete;
__host__ UniqueTexture& operator=(UniqueTexture&& other) noexcept {
CV_Assert(other);
if (&other != this) {
UniqueTexture(std::move(*this)); /* destroy current texture object */
tex = other.tex;
other.tex = cudaTextureObject_t();
}
return *this;
}
__device__ __forceinline__ T operator ()(float y, float x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex2D(CvCudevTextureRef<T>::ref, x, y);
#else
// Use the texture object
return tex2D<T>(texObj, x, y);
#endif
}
};
template <typename T> struct Texture : TexturePtr<T>
{
int rows, cols;
bool cc30;
__host__ explicit Texture(const GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
__host__ cudaTextureObject_t get() const noexcept {
CV_Assert(tex);
return tex;
}
__host__ explicit operator bool() const noexcept { return tex != cudaTextureObject_t(); }
rows = mat.rows;
cols = mat.cols;
private:
if (cc30)
template <class T1>
__host__ void create(const int rows, const int cols, T1* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = mat.data;
texRes.res.pitch2D.height = mat.rows;
texRes.res.pitch2D.width = mat.cols;
texRes.res.pitch2D.pitchInBytes = mat.step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
if (rows == 1) {
CV_Assert(rows == 1 && cols*sizeof(T) == step);
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = data;
texRes.res.linear.sizeInBytes = step;
texRes.res.linear.desc = cudaCreateChannelDesc<T1>();
}
else {
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = data;
texRes.res.pitch2D.height = rows;
texRes.res.pitch2D.width = cols;
texRes.res.pitch2D.pitchInBytes = step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T1>();
}
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
@ -150,109 +162,112 @@ template <typename T> struct Texture : TexturePtr<T>
texDescr.addressMode[0] = addressMode;
texDescr.addressMode[1] = addressMode;
texDescr.addressMode[2] = addressMode;
texDescr.readMode = cudaReadModeElementType;
texDescr.readMode = readMode;
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
CV_CUDEV_SAFE_CALL(cudaCreateTextureObject(&tex, &texRes, &texDescr, 0));
}
else
__host__ void create(const int rows, const int cols, uint64* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
{
// Use the texture reference
CvCudevTextureRef<T>::bind(mat, normalizedCoords, filterMode, addressMode);
create<uint2>(rows, cols, (uint2*)data, step, normalizedCoords, filterMode, addressMode, readMode);
}
}
__host__ ~Texture()
{
if (cc30)
private:
cudaTextureObject_t tex;
};
/** @brief sharable smart CUDA texture object
*
* Texture is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class Texture {
public:
Texture() = default;
Texture(const Texture&) = default;
Texture(Texture&&) = default;
__host__ Texture(const int rows_, const int cols_, T* data, const size_t step, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
rows(rows_), cols(cols_), texture(std::make_shared<UniqueTexture<T,R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode))
{
// Use the texture object
cudaDestroyTextureObject(this->texObj);
}
else
__host__ Texture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
rows(1), cols(static_cast<int>(sizeInBytes/sizeof(T))), texture(std::make_shared<UniqueTexture<T, R>>(sizeInBytes, data, normalizedCoords, filterMode, addressMode, readMode))
{
// Use the texture reference
CvCudevTextureRef<T>::unbind();
}
}
};
template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
{
};
__host__ Texture(PtrStepSz<T> src, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
Texture(src.rows, src.cols, src.data, src.step, normalizedCoords, filterMode, addressMode, readMode)
{
}
#else
Texture& operator=(const Texture&) = default;
Texture& operator=(Texture&&) = default;
template <typename T> struct TexturePtr
{
typedef T value_type;
typedef float index_type;
__host__ explicit operator bool() const noexcept {
if (!texture)
return false;
return texture->operator bool();
}
cudaTextureObject_t texObj;
__host__ operator TexturePtr<T, R>() const {
if (texture)
return TexturePtr<T, R>(texture->get());
else
return TexturePtr<T, R>(cudaTextureObject_t());
}
__device__ __forceinline__ T operator ()(float y, float x) const
{
#if CV_CUDEV_ARCH >= 300
// Use the texture object
return tex2D<T>(texObj, x, y);
#else
CV_UNUSED(y);
CV_UNUSED(x);
return T();
#endif
}
};
template <typename T> struct Texture : TexturePtr<T>
{
int rows, cols;
__host__ explicit Texture(const GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
CV_Assert( deviceSupports(FEATURE_SET_COMPUTE_30) );
rows = mat.rows;
cols = mat.cols;
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = mat.data;
texRes.res.pitch2D.height = mat.rows;
texRes.res.pitch2D.width = mat.cols;
texRes.res.pitch2D.pitchInBytes = mat.step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
texDescr.normalizedCoords = normalizedCoords;
texDescr.filterMode = filterMode;
texDescr.addressMode[0] = addressMode;
texDescr.addressMode[1] = addressMode;
texDescr.addressMode[2] = addressMode;
texDescr.readMode = cudaReadModeElementType;
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
}
__host__ ~Texture()
int rows = 0;
int cols = 0;
protected:
std::shared_ptr<UniqueTexture<T, R>> texture = nullptr;
};
template <typename T, typename R> struct PtrTraits<Texture<T, R>> : PtrTraitsBase<Texture<T, R>, TexturePtr<T, R>>
{
// Use the texture object
cudaDestroyTextureObject(this->texObj);
}
};
};
template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
{
};
#endif
/** @brief sharable smart CUDA texture object with offset
*
* TextureOff is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class TextureOff {
public:
TextureOff(const TextureOff&) = default;
TextureOff(TextureOff&&) = default;
__host__ TextureOff(const int rows, const int cols, T* data, const size_t step, const int yoff_ = 0, const int xoff_ = 0, const bool normalizedCoords = false,
const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
const cudaTextureReadMode readMode = cudaReadModeElementType) :
texture(std::make_shared<UniqueTexture<T, R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode)), xoff(xoff_), yoff(yoff_)
{
}
//! @}
__host__ TextureOff(PtrStepSz<T> src, const int yoff = 0, const int xoff = 0, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
TextureOff(src.rows, src.cols, src.data, src.step, yoff, xoff, normalizedCoords, filterMode, addressMode, readMode)
{
}
TextureOff& operator=(const TextureOff&) = default;
TextureOff& operator=(TextureOff&&) = default;
__host__ operator TextureOffPtr<T, R>() const {
return TextureOffPtr<T, R>(texture->get(), yoff, xoff);
}
private:
int xoff = 0;
int yoff = 0;
std::shared_ptr<UniqueTexture<T, R>> texture = nullptr;
};
}}
#endif
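Putting the pieces together, a host-side usage sketch (assuming a CV_32FC1 input; bindAndShare is a hypothetical name): the shared_ptr inside Texture lets several owners share one texture object, and the implicit conversion yields the TexturePtr view that kernels take by value.
void bindAndShare(const cv::cuda::GpuMat& img)
{
    cv::cuda::PtrStepSz<float> p = img;
    // hardware bilinear filtering is valid here because the element type is float
    cv::cudev::Texture<float> tex(p, false, cudaFilterModeLinear, cudaAddressModeClamp);
    cv::cudev::Texture<float> alias = tex;     // shares the same cudaTextureObject_t
    cv::cudev::TexturePtr<float> view = alias; // lightweight view for kernel arguments
    // ... launch kernels taking cv::cudev::TexturePtr<float>; the texture object is
    // destroyed when the last Texture copy goes out of scope.
    CV_UNUSED(view);
}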

@ -213,7 +213,7 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
return __hiloint2double(hi, lo);
}
__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
__device__ __forceinline__ uint64 shfl_up(uint64 val, uint delta, int width = warpSize)
{
return __shfl_up(val, delta, width);
}

@ -51,6 +51,7 @@
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -59,23 +60,19 @@ namespace cv { namespace cuda { namespace device
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
@ -121,34 +118,8 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
void bindImgTex(PtrStepSzb img)
{
bindTexture(&imgTex, img);
}
size_t bindSumTex(PtrStepSz<uint> sum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
return offset / sizeof(uint);
}
size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
return offset / sizeof(uint);
}
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
template <int N> __device__ float icvCalcHaarPatternSum(cudev::TexturePtr<unsigned int> texSum, const float src[][5], int oldSize, int newSize, int y, int x)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
typedef double real_t;
@ -169,10 +140,10 @@ namespace cv { namespace cuda { namespace device
int dy2 = __float2int_rn(ratio * src[k][3]);
real_t t = 0;
t += tex2D(sumTex, x + dx1, y + dy1);
t -= tex2D(sumTex, x + dx1, y + dy2);
t -= tex2D(sumTex, x + dx2, y + dy1);
t += tex2D(sumTex, x + dx2, y + dy2);
t += texSum(y + dy1, x + dx1);
t -= texSum(y + dy2, x + dx1);
t -= texSum(y + dy1, x + dx2);
t += texSum(y + dy2, x + dx2);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
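For reference, the four fetches implement the summed-area-table identity. Writing $S(y,x)=\sum_{r<y}\sum_{c<x} I(r,c)$ for the integral image bound to texSum, the box sum over $[x_1,x_2)\times[y_1,y_2)$ is
$$\sum_{y_1\le r<y_2}\;\sum_{x_1\le c<x_2} I(r,c) = S(y_1,x_1) - S(y_2,x_1) - S(y_1,x_2) + S(y_2,x_2),$$
and the division by $(dx_2-dx_1)(dy_2-dy_1)$ turns each weighted box sum into a mean over the Haar box.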
@ -201,7 +172,7 @@ namespace cv { namespace cuda { namespace device
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
__global__ void icvCalcLayerDetAndTrace(cudev::TexturePtr<unsigned int> texSum, PtrStepf det, PtrStepf trace)
{
// Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@ -222,29 +193,29 @@ namespace cv { namespace cuda { namespace device
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
const float dx = icvCalcHaarPatternSum<3>(texSum, c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(texSum, c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(texSum, c_DXY, 9, size, (i << c_octave), (j << c_octave));
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers)
{
const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
cudev::Texture<unsigned int> texSum(sum);
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
icvCalcLayerDetAndTrace<<<grid, threads>>>(texSum, det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -255,10 +226,14 @@ namespace cv { namespace cuda { namespace device
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
struct WithMask
template<bool useMask = true>
struct Mask
{
static __device__ bool check(int sum_i, int sum_j, int size)
__host__ Mask(){};
__host__ Mask(cudev::TexturePtr<unsigned int> tex_): tex(tex_) {};
__device__ bool check(int sum_i, int sum_j, int size)
{
if (!useMask) return true;
float ratio = (float)size / 9.0f;
float d = 0;
@ -269,19 +244,20 @@ namespace cv { namespace cuda { namespace device
int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
t += tex(sum_i + dy1, sum_j + dx1);
t -= tex(sum_i + dy2, sum_j + dx1);
t -= tex(sum_i + dy1, sum_j + dx2);
t += tex(sum_i + dy2, sum_j + dx2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
cudev::TexturePtr<unsigned int> tex;
};
template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
template<class T>
__global__ void icvFindMaximaInLayer(T mask, const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
unsigned int* maxCounter)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
@ -323,7 +299,7 @@ namespace cv { namespace cuda { namespace device
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (Mask::check(sum_i, sum_j, size))
if (mask.check(sum_i, sum_j, size))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@ -374,7 +350,7 @@ namespace cv { namespace cuda { namespace device
#endif
}
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
const int layer_rows = img_rows >> octave;
@ -390,10 +366,15 @@ namespace cv { namespace cuda { namespace device
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask)
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
if (use_mask) {
cudev::Texture<unsigned int> texMaskSum(maskSum);
Mask<true> mask(texMaskSum);
icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
}
else {
Mask<false> mask;
icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
}
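// Since useMask is a compile-time template parameter, the early return in
// Mask<false>::check() lets the compiler drop the mask arithmetic and the
// texture fetches from that instantiation entirely.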
cudaSafeCall( cudaGetLastError() );
@ -539,7 +520,7 @@ namespace cv { namespace cuda { namespace device
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
__global__ void icvCalcOrientation(cudev::TexturePtr<unsigned int> texSum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
__shared__ float s_X[128];
__shared__ float s_Y[128];
@ -576,8 +557,8 @@ namespace cv { namespace cuda { namespace device
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
@ -676,8 +657,9 @@ namespace cv { namespace cuda { namespace device
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
cudev::Texture<unsigned int> texSum(sum);
dim3 threads;
threads.x = 32;
threads.y = 4;
@ -685,7 +667,7 @@ namespace cv { namespace cuda { namespace device
dim3 grid;
grid.x = nFeatures;
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
icvCalcOrientation<<<grid, threads>>>(texSum, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -724,12 +706,14 @@ namespace cv { namespace cuda { namespace device
{
typedef uchar elem_type;
__device__ WinReader(cudev::TexturePtr<uchar> tex_) : tex(tex_) {};
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return tex2D(imgTex, pixel_x, pixel_y);
return tex(pixel_y, pixel_x);
}
float centerX;
@ -739,19 +723,17 @@ namespace cv { namespace cuda { namespace device
float sin_dir;
int width;
int height;
cudev::TexturePtr<uchar> tex;
};
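// WinReader now carries the image texture explicitly instead of reading the
// former file-scope imgTex reference, so the descriptor kernels no longer
// depend on a global texture binding.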
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy);
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
__device__ void calc_dx_dy(cudev::TexturePtr<uchar> tex, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
dx = dy = 0.0f;
WinReader win;
WinReader win(tex);
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];
@ -813,14 +795,14 @@ namespace cv { namespace cuda { namespace device
}
}
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_64(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);
@ -839,14 +821,14 @@ namespace cv { namespace cuda { namespace device
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_128(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
@ -925,13 +907,13 @@ namespace cv { namespace cuda { namespace device
descriptor_base[threadIdx.x] = val / s_len;
}
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
cudev::Texture<unsigned char> texImg(img);
if (descriptors.cols == 64)
{
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -943,7 +925,7 @@ namespace cv { namespace cuda { namespace device
}
else
{
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
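On the two launches above (one block per feature, dim3(32, 16) threads): each descriptor is stored as float4s, so a 64-float descriptor is 16 float4s, one per threadIdx.y row, while a 128-float descriptor is 32 float4s, two per row, which is what the "descriptors.ptr(blockIdx.x) + threadIdx.y * 2" indexing earlier encodes. The arithmetic, as compile-time checks:

    #include <vector_types.h>   // float4

    static_assert(64  * sizeof(float) == 16 * sizeof(float4), "64 floats  == 16 x float4");
    static_assert(128 * sizeof(float) == 32 * sizeof(float4), "128 floats == 32 x float4");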

@ -94,23 +94,19 @@ namespace cv { namespace cuda { namespace device
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
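Seen from a call site, the declaration changes above remove the bind-then-call two-step. A hedged sketch: the row indices use the real SURF_CUDA enum, but orientationNewStyle and its arguments are placeholders, and it assumes the declarations above are reachable at cv::cuda::device scope:

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/xfeatures2d/cuda.hpp>

    void orientationNewStyle(const cv::cuda::GpuMat& sum, cv::cuda::GpuMat& keypoints)
    {
        using cv::cuda::SURF_CUDA;
        // Before: bindImgTex(img); sumOffset = bindSumTex(sum); then a call that
        // read the bound globals. After: the integral image is an explicit
        // argument, and GpuMat converts implicitly to PtrStepSz<unsigned int>.
        cv::cuda::device::icvCalcOrientation_gpu(sum,
            keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
            keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW),
            keypoints.cols);
    }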
@ -138,10 +134,7 @@ namespace
class SURF_CUDA_Invoker
{
public:
SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img, const GpuMat& mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty())
SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img_, const GpuMat& mask) : surf_(surf), img(img_), img_cols(img_.cols), img_rows(img_.rows), use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@ -167,16 +160,12 @@ namespace
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex(img);
cuda::integral(img, surf_.sum);
sumOffset = bindSumTex(surf_.sum);
if (use_mask)
{
cuda::min(mask, 1.0, surf_.mask1);
cuda::integral(surf_.mask1, surf_.maskSum);
maskOffset = bindMaskSumTex(surf_.maskSum);
}
}
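A note on the types in this hunk: cuda::integral writes a CV_32SC1 sum image one row and one column larger than the input, and the same GpuMat is then handed to the *_gpu functions through GpuMat's implicit conversion to PtrStepSz<unsigned int>, so the 32-bit values are simply reinterpreted as unsigned. A small sketch; buildSumView is an illustrative name:

    #include <opencv2/cudaarithm.hpp>

    void buildSumView(const cv::cuda::GpuMat& img)        // img: CV_8UC1
    {
        cv::cuda::GpuMat sum;
        cv::cuda::integral(img, sum);                     // CV_32SC1, (rows+1) x (cols+1)
        cv::cuda::PtrStepSz<unsigned int> sumView = sum;  // same memory, viewed as unsigned
        (void)sumView;  // this is what icvCalcOrientation_gpu & co. receive
    }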
@ -195,9 +184,9 @@ namespace
const int layer_cols = img_cols >> octave;
loadOctaveConstants(octave, layer_rows, layer_cols);
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvCalcLayerDetAndTrace_gpu(surf_.sum, surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
icvFindMaximaInLayer_gpu(surf_.maskSum, surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
unsigned int maxCounter;
@ -230,7 +219,7 @@ namespace
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
icvCalcOrientation_gpu(surf_.sum, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
}
}
@ -241,7 +230,7 @@ namespace
if (nFeatures > 0)
{
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
compute_descriptors_gpu(img, descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
}
}
@ -252,6 +241,8 @@ namespace
SURF_CUDA& surf_;
GpuMat img;
int img_cols, img_rows;
bool use_mask;
@ -259,9 +250,6 @@ namespace
int maxCandidates;
int maxFeatures;
size_t maskOffset;
size_t sumOffset;
GpuMat counters;
};
}
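One consequence of the new GpuMat img member above: GpuMat copies are shallow and reference counted, so the invoker keeps the input image's device memory alive for its whole lifetime, which the descriptor stage (and the texture object built over img) relies on. For example:

    #include <opencv2/core/cuda.hpp>

    void shallowCopyDemo()
    {
        cv::cuda::GpuMat a(100, 100, CV_8UC1);
        cv::cuda::GpuMat b = a;        // shallow copy: same device pointer, refcount + 1
        CV_Assert(a.data == b.data);   // memory is released only after both are gone
    }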
