@@ -43,172 +43,151 @@
 #if !defined CUDA_DISABLER
 
 #include <utility>
+#include <algorithm>
 
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
+#include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 
-namespace cv { namespace gpu { namespace device
-{
-    namespace canny
-    {
-        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-        {
-            __shared__ int smem[16][18];
-
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (i < rows)
-            {
-                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
-
-                if (threadIdx.x == 0)
-                {
-                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
-                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
-                }
-
-                __syncthreads();
-
-                if (j < cols)
-                {
-                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
-                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
-                }
-            }
-        }
-
-        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        struct L1
-        {
-            static __device__ __forceinline__ float calc(int x, int y)
-            {
-                return ::abs(x) + ::abs(y);
-            }
-        };
-
-        struct L2
-        {
-            static __device__ __forceinline__ float calc(int x, int y)
-            {
-                return ::sqrtf(x * x + y * y);
-            }
-        };
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
+{
+    struct L1 : binary_function<int, int, float>
+    {
+        __device__ __forceinline__ float operator ()(int x, int y) const
+        {
+            return ::abs(x) + ::abs(y);
+        }
+
+        __device__ __forceinline__ L1() {}
+        __device__ __forceinline__ L1(const L1&) {}
+    };
+
+    struct L2 : binary_function<int, int, float>
+    {
+        __device__ __forceinline__ float operator ()(int x, int y) const
+        {
+            return ::sqrtf(x * x + y * y);
+        }
+
+        __device__ __forceinline__ L2() {}
+        __device__ __forceinline__ L2(const L2&) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<L1> : DefaultTransformFunctorTraits<L1>
+    {
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct TransformFunctorTraits<L2> : DefaultTransformFunctorTraits<L2>
+    {
+        enum { smart_shift = 4 };
+    };
+}}}
 
-        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
-            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-        {
-            __shared__ int sdx[18][16];
-            __shared__ int sdy[18][16];
-
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (j < cols)
-            {
-                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
-                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
-
-                if (threadIdx.y == 0)
-                {
-                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
-                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
-
-                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
-                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
-                }
-
-                __syncthreads();
-
-                if (i < rows)
-                {
-                    int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
-                    int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
-
-                    dx.ptr(i)[j] = x;
-                    dy.ptr(i)[j] = y;
-
-                    mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
-                }
-            }
-        }
-
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            if (L2Grad)
-                calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-            else
-                calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall(cudaThreadSynchronize());
-        }
-
-        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-        {
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (i < rows && j < cols)
-                mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
-        }
-
-        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            if (L2Grad)
-                calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
-            else
-                calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
+namespace
+{
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+    struct SrcTex
+    {
+        const int xoff;
+        const int yoff;
+
+        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
+
+        __device__ __forceinline__ int operator ()(int y, int x) const
+        {
+            return tex2D(tex_src, x + xoff, y + yoff);
+        }
+    };
+
+    template <class Norm> __global__
+    void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (y >= mag.rows || x >= mag.cols)
+            return;
+
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+
+        dx(y, x) = dxVal;
+        dy(y, x) = dyVal;
+
+        mag(y, x) = norm(dxVal, dyVal);
+    }
+}
+
+namespace canny
+{
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
+
+        bindTexture(&tex_src, srcWhole);
+
+        SrcTex src(xoff, yoff);
+
+        if (L2Grad)
+        {
+            L2 norm;
+            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
+        else
+        {
+            L1 norm;
+            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        if (L2Grad)
+        {
+            L2 norm;
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
+        }
+        else
+        {
+            L1 norm;
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
+        }
+    }
+}
 
-        //////////////////////////////////////////////////////////////////////////////////////////
-
-        #define CANNY_SHIFT 15
-        #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-
-        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-        {
-            __shared__ float smem[18][18];
-
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
-
-            const int tid = threadIdx.y * 16 + threadIdx.x;
-            const int lx = tid % 18;
-            const int ly = tid / 18;
-
-            if (ly < 14)
-                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-            __syncthreads();
-
-            if (i < rows && j < cols)
-            {
-                int x = dx.ptr(i)[j];
-                int y = dy.ptr(i)[j];
-
-                const int s = (x ^ y) < 0 ? -1 : 1;
-                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-                x = ::abs(x);
-                y = ::abs(y);
-
-                // 0 - the pixel can not belong to an edge
-                // 1 - the pixel might belong to an edge
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+    __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    {
+        const int CANNY_SHIFT = 15;
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= dx.cols || y >= dx.rows)
+            return;
+
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
+
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+        const float m = tex2D(tex_mag, x, y);
+
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
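Aside (not part of the patch): both versions of calcMap classify the gradient direction with integer arithmetic. TG22 is tan(22.5 deg) ~ 0.4142 scaled by 2^CANNY_SHIFT (Q15 fixed point), and because tan(67.5 deg) = tan(22.5 deg) + 2, the second threshold is simply tg22x + ((dxVal + dxVal) << CANNY_SHIFT). A minimal device-side sketch of the same test, using hypothetical names that do not appear in the patch:

    // Hypothetical helper mirroring the Q15 fixed-point angle test in calcMap.
    __device__ int classifyDirection(int dx, int dy)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097 * (1 << CANNY_SHIFT) + 0.5);

        dx = ::abs(dx);
        dy = ::abs(dy) << CANNY_SHIFT;                        // promote dy to Q15 so it compares against dx * tan(theta)

        const int tg22x = dx * TG22;                          // dx * tan(22.5 deg) in Q15
        const int tg67x = tg22x + ((dx + dx) << CANNY_SHIFT); // dx * tan(67.5 deg), since tan(67.5) = tan(22.5) + 2

        if (dy < tg22x) return 0;   // near-horizontal gradient: suppress against left/right neighbours
        if (dy > tg67x) return 90;  // near-vertical gradient: suppress against top/bottom neighbours
        return 45;                  // diagonal gradient: suppress against the two diagonal neighbours
    }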
@@ -217,73 +196,84 @@ namespace cv { namespace gpu { namespace device
 
-                if (m > low_thresh)
-                {
-                    const int tg22x = x * TG22;
-                    const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
-
-                    y <<= CANNY_SHIFT;
-
-                    if (y < tg22x)
-                    {
-                        if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                    else if( y > tg67x )
-                    {
-                        if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                    else
-                    {
-                        if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                }
-
-                map.ptr(i + 1)[j + 1] = edge_type;
-            }
-        }
-
-        #undef CANNY_SHIFT
-        #undef TG22
-
-        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22;
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
+
+            dyVal <<= CANNY_SHIFT;
+
+            if (dyVal < tg22x)
+            {
+                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x)
+            {
+                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
+
+        map(y, x) = edge_type;
+    }
+}
+
+namespace canny
+{
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
+
+        bindTexture(&tex_mag, mag);
+
+        ::calcMap<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
 
-        //////////////////////////////////////////////////////////////////////////////////////////
-
-        __device__ unsigned int counter = 0;
-
-        __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
-        {
-            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
-
-            __shared__ int smem[18][18];
-
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
-
-            const int tid = threadIdx.y * 16 + threadIdx.x;
-            const int lx = tid % 18;
-            const int ly = tid / 18;
-
-            if (ly < 14)
-                smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-                smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-            __syncthreads();
-
-            if (i < rows && j < cols)
-            {
-                int n;
-
-                #pragma unroll
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    __device__ int counter = 0;
+
+    __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st)
+    {
+        __shared__ volatile int smem[18][18];
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        if (threadIdx.y == 0)
+            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
+        if (threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
+        if (threadIdx.x == 0)
+            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1)
+            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
+            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+
+        __syncthreads();
+
+        if (x >= map.cols || y >= map.rows)
+            return;
+
+        int n;
+
+        #pragma unroll
@@ -311,7 +301,7 @@ namespace cv { namespace gpu { namespace device
 
-                const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-                map.ptr(i + 1)[j + 1] = e;
-
-                n = 0;
+        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+        map(y, x) = e;
+
+        n = 0;
 
@@ -331,69 +321,73 @@ namespace cv { namespace gpu { namespace device
 
-                if (n > 0)
-                {
-                    const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
-                    st[ind] = make_ushort2(j + 1, i + 1);
-                }
-            }
-
-            #endif
-        }
-
-        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
-        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
-
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
-        __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
-
-        __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
-        {
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
-
-            const int stack_size = 512;
-
-            __shared__ unsigned int s_counter;
-            __shared__ unsigned int s_ind;
-            __shared__ ushort2 s_st[stack_size];
-
-            if (threadIdx.x == 0)
-                s_counter = 0;
-
-            __syncthreads();
-
-            int ind = blockIdx.y * gridDim.x + blockIdx.x;
-
-            if (ind < count)
-            {
-                ushort2 pos = st1[ind];
-
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-                {
-                    if (threadIdx.x < 8)
-                    {
-                        pos.x += c_dx[threadIdx.x];
-                        pos.y += c_dy[threadIdx.x];
-
-                        if (map.ptr(pos.y)[pos.x] == 1)
-                        {
-                            map.ptr(pos.y)[pos.x] = 2;
-
-                            ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                            s_st[ind] = pos;
-                        }
-                    }
-
-                    __syncthreads();
-
-                    while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+        if (n > 0)
+        {
+            const int ind = ::atomicAdd(&counter, 1);
+            st[ind] = make_ushort2(x, y);
+        }
+    }
+}
+
+namespace canny
+{
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
+
+        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
+
+        ::edgesHysteresisLocal<<<grid, block>>>(map, st1);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
+    __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+
+    __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+    {
+        const int stack_size = 512;
+
+        __shared__ int s_counter;
+        __shared__ int s_ind;
+        __shared__ ushort2 s_st[stack_size];
+
+        if (threadIdx.x == 0)
+            s_counter = 0;
+
+        __syncthreads();
+
+        int ind = blockIdx.y * gridDim.x + blockIdx.x;
+
+        if (ind >= count)
+            return;
+
+        ushort2 pos = st1[ind];
+
+        if (threadIdx.x < 8)
+        {
+            pos.x += c_dx[threadIdx.x];
+            pos.y += c_dy[threadIdx.x];
+
+            if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
+            {
+                map(pos.y, pos.x) = 2;
+
+                ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+                s_st[ind] = pos;
+            }
+        }
+
+        __syncthreads();
+
+        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
@@ -401,30 +395,31 @@ namespace cv { namespace gpu { namespace device
-                        const int subTaskIdx = threadIdx.x >> 3;
-                        const int portion = ::min(s_counter, blockDim.x >> 3);
-
-                        pos.x = pos.y = 0;
-
-                        if (subTaskIdx < portion)
-                            pos = s_st[s_counter - 1 - subTaskIdx];
-
-                        __syncthreads();
-
-                        if (threadIdx.x == 0)
-                            s_counter -= portion;
-
-                        __syncthreads();
-
-                        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-                        {
-                            pos.x += c_dx[threadIdx.x & 7];
-                            pos.y += c_dy[threadIdx.x & 7];
-
-                            if (map.ptr(pos.y)[pos.x] == 1)
-                            {
-                                map.ptr(pos.y)[pos.x] = 2;
-
-                                ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                                s_st[ind] = pos;
-                            }
-                        }
-
-                        __syncthreads();
-                    }
+            const int subTaskIdx = threadIdx.x >> 3;
+            const int portion = ::min(s_counter, blockDim.x >> 3);
+
+            if (subTaskIdx < portion)
+                pos = s_st[s_counter - 1 - subTaskIdx];
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+                s_counter -= portion;
+
+            __syncthreads();
+
+            if (subTaskIdx < portion)
+            {
+                pos.x += c_dx[threadIdx.x & 7];
+                pos.y += c_dy[threadIdx.x & 7];
+
+                if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
+                {
+                    map(pos.y, pos.x) = 2;
+
+                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+                    s_st[ind] = pos;
+                }
+            }
+
+            __syncthreads();
+        }
@@ -432,70 +427,79 @@ namespace cv { namespace gpu { namespace device
-                    {
-                        if (threadIdx.x == 0)
-                        {
-                            ind = atomicAdd(&counter, s_counter);
-                            s_ind = ind - s_counter;
-                        }
-
-                        __syncthreads();
-
-                        ind = s_ind;
-
-                        for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
-                        {
-                            st2[ind + i] = s_st[i];
-                        }
-                    }
-                }
-            }
-
-            #endif
-        }
-
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
-        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-            unsigned int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
-
-            while (count > 0)
-            {
-                cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
-
-                dim3 block(128, 1, 1);
-                dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
-
-                edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-                cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
-
-                std::swap(st1, st2);
-            }
-        }
-
-        __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
-        {
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
-
-            if (i < rows && j < cols)
-                dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
-        }
-
-        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-            getEdges<<<grid, block>>>(map, dst, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    } // namespace canny
-}}} // namespace cv { namespace gpu { namespace device
+        {
+            if (threadIdx.x == 0)
+            {
+                ind = ::atomicAdd(&counter, s_counter);
+                s_ind = ind - s_counter;
+            }
+
+            __syncthreads();
+
+            ind = s_ind;
+
+            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                st2[ind + i] = s_st[i];
+        }
+    }
+}
+
+namespace canny
+{
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) );
+
+        int count;
+        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        while (count > 0)
+        {
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+
+            const dim3 block(128);
+            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
+
+            ::edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, count);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            std::swap(st1, st2);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    struct GetEdges : unary_function<int, uchar>
+    {
+        __device__ __forceinline__ uchar operator ()(int e) const
+        {
+            return (uchar)(-(e >> 1));
+        }
+
+        __device__ __forceinline__ GetEdges() {}
+        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<GetEdges> : DefaultTransformFunctorTraits<GetEdges>
+    {
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace canny
+{
+    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    {
+        transform(map, dst, GetEdges(), WithOutMask(), 0);
+    }
+}
 
 #endif /* CUDA_DISABLER */
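Aside (not part of the patch): the refactored entry points above are meant to be chained by the host-side Canny wrapper roughly as follows. This is only an illustrative sketch; the allocation of dx, dy, mag, map, the two ushort2 stacks st1/st2 and the output dst is assumed and is not shown in this diff.

    // Illustrative call sequence over pre-allocated buffers (assumed, not part of the patch).
    canny::calcMagnitude(srcWhole, xoff, yoff, dx, dy, mag, L2Grad); // Sobel derivatives + L1/L2 gradient norm
    canny::calcMap(dx, dy, mag, map, low_thresh, high_thresh);       // non-maximum suppression into a 0/1/2 map
    canny::edgesHysteresisLocal(map, st1);                           // block-local hysteresis, seeds the edge-point stack
    canny::edgesHysteresisGlobal(map, st1, st2);                     // global hysteresis, ping-pongs st1/st2 until the stack is empty
    canny::getEdges(map, dst);                                       // map value 2 becomes 255, everything else 0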