pull/258/merge
Vladislav Vinogradov, 12 years ago
parent e299595667
commit 28716d7f30

Changed files:
  1. modules/gpu/include/opencv2/gpu/gpu.hpp (24 changed lines)
  2. modules/gpu/src/cuda/canny.cu (694 changed lines)
  3. modules/gpu/src/imgproc.cpp (107 changed lines)
  4. modules/gpu/test/test_imgproc.cpp (2 changed lines)

modules/gpu/include/opencv2/gpu/gpu.hpp

@@ -792,31 +792,23 @@ private:
     GpuMat lab, l, ab;
 };
 
+struct CV_EXPORTS CannyBuf;
+
+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
 struct CV_EXPORTS CannyBuf
 {
-    CannyBuf() {}
-    explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
-    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
-
     void create(const Size& image_size, int apperture_size = 3);
     void release();
 
     GpuMat dx, dy;
-    GpuMat dx_buf, dy_buf;
-    GpuMat edgeBuf;
-    GpuMat trackBuf1, trackBuf2;
+    GpuMat mag;
+    GpuMat map;
+    GpuMat st1, st2;
     Ptr<FilterEngine_GPU> filterDX, filterDY;
 };
 
-CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
 class CV_EXPORTS ImagePyramid
 {
 public:

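For reference, a minimal usage sketch of the reworked API declared above. This is not part of the commit; the file name and thresholds are placeholders, and an 8-bit single-channel input is assumed.

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Load a grayscale host image (placeholder path; 0 = load as grayscale).
    cv::Mat src = cv::imread("input.png", 0);

    cv::gpu::GpuMat d_src(src);   // upload to the device
    cv::gpu::GpuMat d_edges;
    cv::gpu::CannyBuf buf;        // holds dx, dy, mag, map, st1, st2 between calls

    // The CannyBuf overload reuses the intermediate buffers across calls.
    cv::gpu::Canny(d_src, buf, d_edges, 50.0, 100.0, 3, false);

    cv::Mat edges;
    d_edges.download(edges);      // copy the result back to the host
    return 0;
}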
modules/gpu/src/cuda/canny.cu

@@ -43,459 +43,463 @@
#if !defined CUDA_DISABLER

#include <utility>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __device__ __forceinline__ L1() {}
        __device__ __forceinline__ L1(const L1&) {}
    };

    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __device__ __forceinline__ L2() {}
        __device__ __forceinline__ L2(const L2&) {}
    };
}

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<L1> : DefaultTransformFunctorTraits<L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<L2> : DefaultTransformFunctorTraits<L2>
    {
        enum { smart_shift = 4 };
    };
}}}
namespace
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);

    struct SrcTex
    {
        const int xoff;
        const int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    template <class Norm> __global__
    void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }
}

namespace canny
{
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
    }
}
//////////////////////////////////////////////////////////////////////////////////////////

namespace
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

    __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= dx.cols || y >= dx.rows)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }
}

namespace canny
{
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        ::calcMap<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
//////////////////////////////////////////////////////////////////////////////////////////

namespace
{
    __device__ int counter = 0;

    __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_ushort2(x, y);
        }
    }
}

namespace canny
{
    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        ::edgesHysteresisLocal<<<grid, block>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
//////////////////////////////////////////////////////////////////////////////////////////

namespace
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

    __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ ushort2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        ushort2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                ind = ::atomicAdd(&counter, s_counter);
                s_ind = ind - s_counter;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }
}
namespace canny
{
    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) );

        int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            ::edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

            std::swap(st1, st2);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __device__ __forceinline__ GetEdges() {}
        __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<GetEdges> : DefaultTransformFunctorTraits<GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
}

#endif /* CUDA_DISABLER */
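One note on the map encoding used throughout this file (0 = cannot be an edge, 1 = weak candidate, 2 = edge, per the comments in calcMap): the GetEdges functor converts that map into the final 8-bit image with (uchar)(-(e >> 1)), so 2 becomes 255 while 0 and 1 become 0. A tiny host-side sketch of the same arithmetic, for illustration only:

// 2 -> -(2 >> 1) = -1, which wraps to 255 in an unsigned char; 0 and 1 -> 0.
inline unsigned char mapToEdgePixel(int e)
{
    return (unsigned char)(-(e >> 1));
}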

modules/gpu/src/imgproc.cpp

@@ -91,7 +91,6 @@ void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
-cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
 void cv::gpu::CannyBuf::release() { throw_nogpu(); }
@@ -1466,92 +1465,76 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 //////////////////////////////////////////////////////////////////////////////
 // Canny
 
-cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
-{
-    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
-
-    create(dx_.size(), -1);
-}
-
 void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 {
-    ensureSizeIsEnough(image_size, CV_32SC1, dx);
-    ensureSizeIsEnough(image_size, CV_32SC1, dy);
-
-    if (apperture_size == 3)
+    if (apperture_size > 0)
     {
-        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
-        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
-    }
-    else if(apperture_size > 0)
-    {
-        if (!filterDX)
+        ensureSizeIsEnough(image_size, CV_32SC1, dx);
+        ensureSizeIsEnough(image_size, CV_32SC1, dy);
+
+        if (apperture_size != 3)
+        {
             filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-        if (!filterDY)
             filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+        }
     }
 
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
+    ensureSizeIsEnough(image_size, CV_32FC1, mag);
+    ensureSizeIsEnough(image_size, CV_32SC1, map);
 
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
 }
 
 void cv::gpu::CannyBuf::release()
 {
     dx.release();
     dy.release();
-    dx_buf.release();
-    dy_buf.release();
-    edgeBuf.release();
-    trackBuf1.release();
-    trackBuf2.release();
+    mag.release();
+    map.release();
+    st1.release();
+    st2.release();
 }
 
-namespace cv { namespace gpu { namespace device
+namespace canny
 {
-    namespace canny
-    {
-        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
-
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
-        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
-
-        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
-
-        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
-
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
-
-        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
-    }
-}}}
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
+
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
+
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
+
+    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+}
 
 namespace
 {
-    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
     {
-        using namespace ::cv::gpu::device::canny;
+        using namespace canny;
 
-        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
 
-        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
 
-        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
 
-        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+        getEdges(buf.map, dst);
     }
 }
 
 void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    CannyBuf buf(src.size(), apperture_size);
+    CannyBuf buf;
     Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
 }
 
 void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
 
     CV_Assert(src.type() == CV_8UC1);
@@ -1562,37 +1545,37 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
         std::swap( low_thresh, high_thresh);
 
     dst.create(src.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
     buf.create(src.size(), apperture_size);
-    buf.edgeBuf.setTo(Scalar::all(0));
 
     if (apperture_size == 3)
     {
-        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
+        Size wholeSize;
+        Point ofs;
+        src.locateROI(wholeSize, ofs);
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
 
-        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
     }
     else
     {
         buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
         buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
 
-        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
 
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    CannyBuf buf(dx, dy);
+    CannyBuf buf;
     Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
 }
 
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
 
     CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
     CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
@@ -1601,17 +1584,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
         std::swap( low_thresh, high_thresh);
 
     dst.create(dx.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
-    buf.dx = dx; buf.dy = dy;
     buf.create(dx.size(), -1);
-    buf.edgeBuf.setTo(Scalar::all(0));
 
-    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+    calcMagnitude(dx, dy, buf.mag, L2gradient);
 
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 
 #endif /* !defined (HAVE_CUDA) */

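A second usage sketch, for the precomputed-derivative overload refactored above (again not part of this commit): the CPU Sobel-plus-convertTo path below is only one assumed way to produce the CV_32SC1 inputs this overload asserts on, and the overload also requires shared-atomics support on the device.

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

void cannyFromDerivatives(const cv::Mat& src8u, cv::Mat& edges)
{
    // Build CV_32SC1 derivatives on the host: Sobel into 16S, then widen to 32S.
    cv::Mat dx16, dy16, dx32, dy32;
    cv::Sobel(src8u, dx16, CV_16S, 1, 0, 3);
    cv::Sobel(src8u, dy16, CV_16S, 0, 1, 3);
    dx16.convertTo(dx32, CV_32S);
    dy16.convertTo(dy32, CV_32S);

    cv::gpu::GpuMat d_dx(dx32), d_dy(dy32), d_edges;
    cv::gpu::CannyBuf buf;

    // Uses buf.mag, buf.map, buf.st1 and buf.st2 internally, as in CannyCaller above.
    cv::gpu::Canny(d_dx, d_dy, buf, d_edges, 50.0, 100.0, false);

    d_edges.download(edges);
}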
modules/gpu/test/test_imgproc.cpp

@@ -313,7 +313,7 @@ TEST_P(Canny, Accuracy)
     cv::Mat edges_gold;
     cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
 
-    EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
+    EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }
 }
