@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
         {
             __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
             __shared__ int s_qsize[4];
-            __shared__ int s_start[4];
+            __shared__ int s_globStart[4];
 
             const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();
 
             // fill the queue
+            const uchar* srcRow = src.ptr(y);
             for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
             {
-                if (src(y, xx))
+                if (srcRow[xx])
                 {
                     const unsigned int val = (y << 16) | xx;
                     const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
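
A note on the srcRow line added above: indexing the image through the 2D operator recomputes the row address from the pitch for every pixel tested, while hoisting src.ptr(y) out of the loop does that arithmetic once per thread. A minimal sketch of the pattern (names hypothetical; pitch in bytes, as in DevMem2Db):

    // Hoist the row base pointer out of the per-pixel loop.
    __device__ void scanRow(const unsigned char* data, size_t step, int y, int cols)
    {
        const unsigned char* row = data + y * step;  // one pitch multiply per row

        for (int x = threadIdx.x; x < cols; x += blockDim.x)
        {
            if (row[x])  // plain indexed load afterwards
            {
                // ... enqueue ((y << 16) | x) here ...
            }
        }
    }
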
@@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
             if (threadIdx.x == 0 && threadIdx.y == 0)
             {
                 // find how many items are stored in each list
-                int total_size = 0;
+                int totalSize = 0;
                 for (int i = 0; i < blockDim.y; ++i)
                 {
-                    s_start[i] = total_size;
-                    total_size += s_qsize[i];
+                    s_globStart[i] = totalSize;
+                    totalSize += s_qsize[i];
                 }
 
                 // calculate the offset in the global list
-                const int global_offset = atomicAdd(&g_counter, total_size);
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
                 for (int i = 0; i < blockDim.y; ++i)
-                    s_start[i] += global_offset;
+                    s_globStart[i] += globalOffset;
             }
 
             __syncthreads();
 
             // copy local queues to global queue
             const int qsize = s_qsize[threadIdx.y];
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x)
-            {
-                const unsigned int val = s_queues[threadIdx.y][i];
-                list[s_start[threadIdx.y] + i] = val;
-            }
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+                list[gidx] = s_queues[threadIdx.y][i];
         }
 
         int buildPointList_gpu(DevMem2Db src, unsigned int* list)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
 
             const dim3 block(32, 4);
             const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
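
The queue flush above is a two-level stream compaction: each row of the 32x4 block owns one shared-memory queue, thread (0, 0) turns the per-queue counts into start offsets, a single global atomicAdd reserves the block's slice of the output list, and the rewritten copy loop makes consecutive threads write consecutive slots so the stores coalesce. A self-contained sketch of the same pattern with one queue per block (all names hypothetical):

    __device__ int d_count;  // global output cursor

    // Compacts the non-zero elements of in[] into out[]; launch with
    // blockDim.x <= 256 so the local queue cannot overflow.
    __global__ void compact(const int* in, int n, int* out)
    {
        __shared__ int s_queue[256];
        __shared__ int s_size;       // items queued by this block
        __shared__ int s_globStart;  // block's offset into out[]

        if (threadIdx.x == 0)
            s_size = 0;
        __syncthreads();

        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && in[i] != 0)
            s_queue[atomicAdd(&s_size, 1)] = in[i];
        __syncthreads();

        // one global atomic per block instead of one per element
        if (threadIdx.x == 0)
            s_globStart = atomicAdd(&d_count, s_size);
        __syncthreads();

        // consecutive threads fill consecutive slots: coalesced stores
        for (int j = threadIdx.x; j < s_size; j += blockDim.x)
            out[s_globStart + j] = s_queue[j];
    }

The rewritten copy loop in the hunk achieves the same coalescing by advancing gidx in lockstep with i.
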
@@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device
 
             cudaSafeCall( cudaDeviceSynchronize() );
 
-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-            return total_count;
+            return totalCount;
         }
 
         ////////////////////////////////////////////////////////////////////////
@@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
             const int n = blockIdx.x;
             const float ang = n * theta;
 
-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
 
-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;
 
+            int* accumRow = accum.ptr(n + 1);
             for (int i = threadIdx.x; i < count; i += blockDim.x)
             {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
 
-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;
 
-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
 
-                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+                ::atomicAdd(accumRow + r + 1, 1);
             }
         }
 
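
For reference, the loop body evaluates the normal form of the Hough transform, r = x*cos(theta) + y*sin(theta), quantized to a bin index: the 1/rho scale is folded into sinVal/cosVal once per block instead of twice per point, and shift recentres the signed distance onto non-negative indices. A host-side analogue of a single vote, assuming the same 16-bit packing as the kernel (hypothetical helper; rounding mode aside, std::lround stands in for __float2int_rn):

    #include <cmath>

    inline int houghBin(unsigned int packed, int n, float theta, float rho, int numrho)
    {
        const int x = packed & 0xFFFF;          // low half: column
        const int y = (packed >> 16) & 0xFFFF;  // high half: row

        const float ang    = n * theta;
        const float sinVal = std::sin(ang) / rho;
        const float cosVal = std::cos(ang) / rho;

        const int r = static_cast<int>(std::lround(x * cosVal + y * sinVal));
        return r + (numrho - 1) / 2;  // recentre into [0, numrho)
    }
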
@@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
             const int n = blockIdx.x;
             const float ang = n * theta;
 
-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
 
-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;
 
             for (int i = threadIdx.x; i < count; i += blockDim.x)
             {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
 
-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;
 
-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
 
                 Emulation::smem::atomicAdd(&smem[r + 1], 1);
             }
 
             __syncthreads();
 
-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
-                accum(n + 1, i) = smem[i];
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i];
         }
 
         void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
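
Beyond the renames, the copy-out loop in this hunk appears to fix an off-by-one: votes are accumulated at smem[r + 1], so the populated bins occupy indices 1 through numrho, and the old bound of numrho left the last rho bin uncopied.
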
@@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
         ////////////////////////////////////////////////////////////////////////
         // linesGetResult
 
-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
         {
             __shared__ int smem[8][32];
 
-            int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
-            int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
+            const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+            const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
 
-            if (r >= accum.cols || n >= accum.rows)
+            if (x >= accum.cols || y >= accum.rows)
                 return;
 
-            smem[threadIdx.y][threadIdx.x] = accum(n, r);
+            smem[threadIdx.y][threadIdx.x] = accum(y, x);
             __syncthreads();
 
-            r -= 1;
-            n -= 1;
+            const int r = x - 1;
+            const int n = y - 1;
 
             if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
                 return;
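
linesGetResult stages a 32x8 tile of the accumulator in shared memory and lets only the interior threads test their cell, so every candidate sees its four neighbours without re-reading global memory; the blocks overlap by two rows and columns, which is where the blockDim - 2 grid arithmetic comes from. A stripped-down sketch of the halo pattern (hypothetical names; the threshold test and output are elided, and the strict/non-strict comparison split is one common tie-break, not necessarily the one used here):

    // Launch with block = dim3(32, 8); step is the row pitch in elements.
    __global__ void localMax(const int* votes, int rows, int cols, int step)
    {
        __shared__ int smem[8][32];

        // each block yields (blockDim - 2) fresh results per axis
        const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
        const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;

        int v = 0;
        if (x < cols && y < rows)
            v = votes[y * step + x];
        smem[threadIdx.y][threadIdx.x] = v;
        __syncthreads();

        // border threads only staged halo cells for their neighbours
        if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 ||
            threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 ||
            x >= cols - 1 || y >= rows - 1)
            return;

        if (v > smem[threadIdx.y][threadIdx.x - 1] && v >= smem[threadIdx.y][threadIdx.x + 1] &&
            v > smem[threadIdx.y - 1][threadIdx.x] && v >= smem[threadIdx.y + 1][threadIdx.x])
        {
            // ... (y, x) is a candidate peak: threshold and emit here ...
        }
    }
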
@@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device
 
         int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
 
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
+            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
 
-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-            total_count = ::min(total_count, maxSize);
+            totalCount = ::min(totalCount, maxSize);
 
-            if (doSort && total_count > 0)
+            if (doSort && totalCount > 0)
             {
-                thrust::device_ptr<float2> out_ptr(out);
-                thrust::device_ptr<int> votes_ptr(votes);
-                thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
+                thrust::device_ptr<float2> outPtr(out);
+                thrust::device_ptr<int> votesPtr(votes);
+                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
             }
 
-            return total_count;
+            return totalCount;
         }
     }
 }}}
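
The doSort branch keys the sort on the vote counts and reorders the float2 (rho, theta) output in tandem, so the strongest lines come first. A minimal usage sketch against raw device pointers (hypothetical wrapper; compiled as CUDA so float2 is available):

    #include <thrust/device_ptr.h>
    #include <thrust/functional.h>
    #include <thrust/sort.h>

    void sortLinesByVotes(int* d_votes, float2* d_lines, int count)
    {
        thrust::device_ptr<int>    votesPtr(d_votes);
        thrust::device_ptr<float2> linesPtr(d_lines);

        // descending by votes; d_lines is permuted together with the keys
        thrust::sort_by_key(votesPtr, votesPtr + count, linesPtr, thrust::greater<int>());
    }
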