Merge pull request #2801 from Atlas42:cuda-hough-stream-fix

Added stream support for Hough circles, lines, and segments

* Added stream support for Hough circles, lines, and segments
- Passed the stream to the relevant CUDA, OpenCV, and Thrust library calls
- Replaced the cudaDeviceSynchronize calls with cudaStreamSynchronize
- Added extra synchronize calls after device-to-host transfers
- Replaced the __device__ global counters with cudaMalloc-allocated values (the overall pattern is sketched below, after the commit message)

* Fixed missing include for CUDA 8

Co-authored-by: william.fink <will@recurse.io>
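
The core of the change, condensed into a minimal standalone sketch (illustrative kernel and function names, not the actual OpenCV code): the per-file __device__ int g_counter symbol is replaced by a caller-owned counter allocated with cudaMalloc, and the memset, kernel launch, and device-to-host copy are all issued on the caller's stream, followed by a single cudaStreamSynchronize before the count is read on the host.

// Minimal sketch of the counter/stream pattern applied throughout this PR
// (hypothetical names; cudaSafeCall-style error checking omitted).
#include <cuda_runtime.h>
#include <cstdio>

__global__ void countPositive(const int* data, int n, int* counterPtr)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && data[i] > 0)
        ::atomicAdd(counterPtr, 1);              // was: atomicAdd(&g_counter, 1)
}

int countPositive_gpu(const int* data, int n, int* counterPtr, cudaStream_t stream)
{
    // Reset the externally allocated counter on the caller's stream
    // (was: cudaGetSymbolAddress + blocking cudaMemset on the default stream).
    cudaMemsetAsync(counterPtr, 0, sizeof(int), stream);

    const int block = 256;
    const int grid  = (n + block - 1) / block;
    countPositive<<<grid, block, 0, stream>>>(data, n, counterPtr);

    // Read the result back asynchronously and synchronize only this stream
    // (was: cudaDeviceSynchronize + blocking cudaMemcpy).
    int totalCount = 0;
    cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return totalCount;
}

int main()
{
    const int n = 1 << 20;
    int *d_data = NULL, *d_counter = NULL;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMalloc(&d_counter, sizeof(int));         // replaces the __device__ global
    cudaMemset(d_data, 1, n * sizeof(int));      // every int becomes 0x01010101 > 0

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    printf("positive elements: %d\n", countPositive_gpu(d_data, n, d_counter, stream));

    cudaStreamDestroy(stream);
    cudaFree(d_counter);
    cudaFree(d_data);
    return 0;
}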
Atlas42 committed via GitHub
parent 6d5f440402
commit 33ae078b09
  1. modules/cudaimgproc/src/cuda/build_point_list.cu (20 changed lines)
  2. modules/cudaimgproc/src/cuda/hough_circles.cu (46 changed lines)
  3. modules/cudaimgproc/src/cuda/hough_lines.cu (33 changed lines)
  4. modules/cudaimgproc/src/cuda/hough_segments.cu (27 changed lines)
  5. modules/cudaimgproc/src/hough_circles.cpp (45 changed lines)
  6. modules/cudaimgproc/src/hough_lines.cpp (39 changed lines)
  7. modules/cudaimgproc/src/hough_segments.cpp (36 changed lines)
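
The three .cpp files above all follow the same host-side shape, condensed here into a hypothetical detector class (a sketch only, not the actual OpenCV implementation): the counter is allocated once in the constructor and freed in the destructor, the raw cudaStream_t is obtained with StreamAccessor::getStream, and the GpuMat operations use their stream-aware overloads.

// Condensed sketch of the host-side pattern shared by the detector
// implementations in the diff below (hypothetical class; error handling trimmed).
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <cuda_runtime.h>

class StreamAwareDetector
{
public:
    StreamAwareDetector()  { cudaMalloc(&counterPtr_, sizeof(int)); }  // replaces __device__ int g_counter
    ~StreamAwareDetector() { cudaFree(counterPtr_); }

    void detect(cv::InputArray _src, cv::OutputArray dst, cv::cuda::Stream& stream)
    {
        // Extract the raw stream once and forward it to every CUDA helper call.
        cudaStream_t cudaStream = cv::cuda::StreamAccessor::getStream(stream);
        CV_UNUSED(cudaStream);  // the real detectors pass it to the *_gpu helpers

        cv::cuda::GpuMat src = _src.getGpuMat();
        CV_Assert(src.type() == CV_8UC1);

        accum_.create(src.size(), CV_32SC1);
        accum_.setTo(cv::Scalar::all(0), stream);   // stream-aware GpuMat overload

        // ... the *_gpu helpers would run here with (..., counterPtr_, cudaStream) ...

        accum_.copyTo(dst, stream);                 // asynchronous copy of the result
    }

private:
    cv::cuda::GpuMat accum_;
    int* counterPtr_ = nullptr;
};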

@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
__device__ int g_counter;
template <int PIXELS_PER_THREAD>
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)
{
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device
}
// calculate the offset in the global list
const int globalOffset = atomicAdd(&g_counter, totalSize);
const int globalOffset = atomicAdd(counterPtr, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_globStart[i] += globalOffset;
}
@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)
{
const int PIXELS_PER_THREAD = 16;
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return totalCount;
}

@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_circles
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// circlesAccumCenters
@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
}
}
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(count, block.x));
cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
circlesAccumCenters<<<grid, block, 0, stream>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaStreamSynchronize(stream) );
}
////////////////////////////////////////////////////////////////////////
// buildCentersList
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
{
const unsigned int val = (y << 16) | x;
const int idx = ::atomicAdd(&g_counter, 1);
const int idx = ::atomicAdd(counterPtr, 1);
centers[idx] = val;
}
}
}
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
buildCentersList<<<grid, block>>>(accum, centers, threshold);
buildCentersList<<<grid, block, 0, stream>>>(accum, centers, threshold, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return totalCount;
}
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
__global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
float3* circles, const int maxCircles, const float dp,
const int minRadius, const int maxRadius, const int histSize, const int threshold)
const int minRadius, const int maxRadius, const int histSize, const int threshold,
int* counterPtr)
{
int* smem = DynamicSharedMem<int>();
@@ -219,7 +213,7 @@ namespace cv { namespace cuda { namespace device
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxCircles)
circles[ind] = make_float3(cx, cy, i + minRadius);
}
@@ -227,12 +221,9 @@ namespace cv { namespace cuda { namespace device
}
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(centersCount);
@@ -240,13 +231,12 @@ namespace cv { namespace cuda { namespace device
const int histSize = maxRadius - minRadius + 1;
size_t smemSize = (histSize + 2) * sizeof(int);
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
circlesAccumRadius<<<grid, block, smemSize, stream>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxCircles);

@@ -44,6 +44,7 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
@@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_lines
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// linesAccum
@@ -126,7 +125,7 @@ namespace cv { namespace cuda { namespace device
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream)
{
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(accum.rows - 2);
@@ -134,19 +133,18 @@ namespace cv { namespace cuda { namespace device
size_t smemSize = (accum.cols - 1) * sizeof(int);
if (smemSize < sharedMemPerBlock - 1000)
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
linesAccumShared<<<grid, block, smemSize, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
else
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
linesAccumGlobal<<<grid, block, 0, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaStreamSynchronize(stream) );
}
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device
const float radius = (r - (numrho - 1) * 0.5f) * rho;
const float angle = n * theta;
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
{
out[ind] = make_float2(radius, angle);
@@ -174,25 +172,22 @@ namespace cv { namespace cuda { namespace device
}
}
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
linesGetResult<<<grid, block, 0, stream>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);
@@ -200,7 +195,7 @@ namespace cv { namespace cuda { namespace device
{
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return totalCount;
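
The thrust::cuda::par.on(stream) execution policy above is what the newly added thrust/system/cuda/execution_policy.h include provides; without it, sort_by_key would run on the default stream. A standalone illustration of the same call (hypothetical data, not the OpenCV vote/line buffers):

// Sorting key/value pairs on a specific CUDA stream with Thrust,
// mirroring the sort_by_key call in linesGetResult_gpu (illustrative data only).
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/execution_policy.h>

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Votes (keys) and line parameters (values), as in the Hough result buffers.
    thrust::device_vector<int>    votes(4);
    thrust::device_vector<float2> out(4);
    votes[0] = 10; votes[1] = 40; votes[2] = 20; votes[3] = 30;
    out[0] = make_float2(0.f, 0.f); out[1] = make_float2(1.f, 0.f);
    out[2] = make_float2(2.f, 0.f); out[3] = make_float2(3.f, 0.f);

    // par.on(stream) binds the sort to the caller's stream instead of the default one.
    thrust::sort_by_key(thrust::cuda::par.on(stream),
                        votes.begin(), votes.end(), out.begin(),
                        thrust::greater<int>());

    cudaStreamSynchronize(stream);
    printf("top vote count: %d\n", (int)votes[0]);   // prints 40

    cudaStreamDestroy(stream);
    return 0;
}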

@@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_segments
{
__device__ int g_counter;
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void houghLinesProbabilistic(const PtrStepSzi accum,
int4* out, const int maxSize,
const float rho, const float theta,
const int lineGap, const int lineLength,
const int rows, const int cols)
const int rows, const int cols,
int* counterPtr)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
@@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
@@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device
}
}
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
bindTexture(&tex_mask, mask);
houghLinesProbabilistic<<<grid, block>>>(accum,
houghLinesProbabilistic<<<grid, block, 0, stream>>>(accum,
out, maxSize,
rho, theta,
lineGap, lineLength,
mask.rows, mask.cols);
mask.rows, mask.cols,
counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);

@@ -55,15 +55,15 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_circles
{
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream);
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream);
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -73,6 +73,7 @@ namespace
{
public:
HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
~HoughCirclesDetectorImpl();
void detect(InputArray src, OutputArray circles, Stream& stream);
@@ -140,6 +141,8 @@ namespace
Ptr<cuda::Filter> filterDx_;
Ptr<cuda::Filter> filterDy_;
Ptr<cuda::CannyEdgeDetector> canny_;
int* counterPtr_;
};
bool centersCompare(Vec3f a, Vec3f b) {return (a[2] > b[2]);}
@@ -153,16 +156,22 @@ namespace
filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
HoughCirclesDetectorImpl::~HoughCirclesDetectorImpl()
{
// TODO : implement async version
CV_UNUSED(stream);
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_circles;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -182,13 +191,13 @@ namespace
canny_->setLowThreshold(std::max(cannyThreshold_ / 2, 1));
canny_->setHighThreshold(cannyThreshold_);
canny_->detect(dx_, dy_, edges_);
canny_->detect(dx_, dy_, edges_, stream);
ensureSizeIsEnough(2, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>(0);
unsigned int* centers = list_.ptr<unsigned int>(1);
const int pointsCount = buildPointList_gpu(edges_, srcPoints);
const int pointsCount = buildPointList_gpu(edges_, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
circles.release();
@@ -196,13 +205,13 @@ namespace
}
ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp);
circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp, cudaStream);
accum_.download(tt);
accum_.download(tt, stream);
int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_);
int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_, counterPtr_, cudaStream);
if (centersCount == 0)
{
circles.release();
@@ -218,7 +227,8 @@ namespace
ushort2* oldBuf = oldBuf_.data();
ushort2* newBuf = newBuf_.data();
cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost, cudaStream) );
cudaSafeCall( cudaStreamSynchronize(cudaStream) );
const int cellSize = cvRound(minDist_);
const int gridWidth = (src.cols + cellSize - 1) / cellSize;
@@ -290,14 +300,15 @@ namespace
}
}
cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) );
cudaSafeCall( cudaMemcpyAsync(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice, cudaStream) );
centersCount = newCount;
}
ensureSizeIsEnough(1, maxCircles_, CV_32FC3, result_);
int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, result_.ptr<float3>(), maxCircles_,
dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20));
dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20),
counterPtr_, cudaStream);
if (circlesCount == 0)
{
@@ -306,7 +317,7 @@ namespace
}
result_.cols = circlesCount;
result_.copyTo(circles);
result_.copyTo(circles, stream);
}
}

@@ -55,13 +55,13 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -70,10 +70,8 @@ namespace
class HoughLinesDetectorImpl : public HoughLinesDetector
{
public:
HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
{
}
HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines);
~HoughLinesDetectorImpl();
void detect(InputArray src, OutputArray lines, Stream& stream);
void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);
@@ -124,16 +122,27 @@ namespace
GpuMat accum_;
GpuMat list_;
GpuMat result_;
int* counterPtr_;
};
void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
HoughLinesDetectorImpl::HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
{
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
HoughLinesDetectorImpl::~HoughLinesDetectorImpl()
{
// TODO : implement async version
CV_UNUSED(stream);
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_lines;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -143,7 +152,7 @@ namespace
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
lines.release();
@@ -155,14 +164,14 @@ namespace
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
ensureSizeIsEnough(2, maxLines_, CV_32FC2, result_);
int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_);
int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_, counterPtr_, cudaStream);
if (linesCount == 0)
{
@@ -171,7 +180,7 @@ namespace
}
result_.cols = linesCount;
result_.copyTo(lines);
result_.copyTo(lines, stream);
}
void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)

@@ -55,17 +55,17 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
}
namespace hough_segments
{
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength);
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -74,10 +74,8 @@ namespace
class HoughSegmentDetectorImpl : public HoughSegmentDetector
{
public:
HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
{
}
HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines);
~HoughSegmentDetectorImpl();
void detect(InputArray src, OutputArray lines, Stream& stream);
@@ -127,8 +125,21 @@ namespace
GpuMat accum_;
GpuMat list_;
GpuMat result_;
int* counterPtr_;
};
HoughSegmentDetectorImpl::HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
{
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
HoughSegmentDetectorImpl::~HoughSegmentDetectorImpl()
{
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
{
// TODO : implement async version
@@ -138,6 +149,7 @@ namespace
using namespace cv::cuda::device::hough_lines;
using namespace cv::cuda::device::hough_segments;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -147,7 +159,7 @@ namespace
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
lines.release();
@@ -159,14 +171,14 @@ namespace
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
ensureSizeIsEnough(1, maxLines_, CV_32SC4, result_);
int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_);
int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_, counterPtr_, cudaStream);
if (linesCount == 0)
{
@@ -175,7 +187,7 @@ namespace
}
result_.cols = linesCount;
result_.copyTo(lines);
result_.copyTo(lines, stream);
}
}
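
With these changes the public detectors can run end to end on a user-supplied cv::cuda::Stream. A small usage sketch (hypothetical parameter values; assumes OpenCV built with the cudaimgproc module):

// Hypothetical end-to-end usage of the stream support added in this PR.
#include <cstdio>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
    // Build a simple binary test image on the host and upload it.
    cv::Mat h_src(480, 640, CV_8UC1, cv::Scalar::all(0));
    cv::line(h_src, cv::Point(50, 50), cv::Point(600, 400), cv::Scalar::all(255), 2);

    cv::cuda::Stream stream;                      // non-default stream
    cv::cuda::GpuMat d_src, d_lines;
    d_src.upload(h_src, stream);

    // detect() now issues its kernels, memsets, and copies on `stream`.
    cv::Ptr<cv::cuda::HoughSegmentDetector> hough =
        cv::cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0), 50, 5);
    hough->detect(d_src, d_lines, stream);

    std::vector<cv::Vec4i> lines;
    if (!d_lines.empty())
    {
        lines.resize(d_lines.cols);
        cv::Mat h_lines(1, d_lines.cols, CV_32SC4, &lines[0]);
        d_lines.download(h_lines, stream);
    }
    stream.waitForCompletion();                   // results are valid after this point
    printf("detected %d segment(s)\n", (int)lines.size());
    return 0;
}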
