|
|
|
@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
__constant__ float c_dxy_scale; |
|
|
|
|
// The scale associated with the first interval of the first octave |
|
|
|
|
__constant__ float c_initialScale; |
|
|
|
|
//! The interest operator threshold |
|
|
|
|
// The interest operator threshold |
|
|
|
|
__constant__ float c_threshold; |
|
|
|
|
|
|
|
|
|
// Ther octave |
|
|
|
@ -685,7 +685,6 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
// - SURF says to only use a circle, but the branching logic would slow it down |
|
|
|
|
// - Gaussian weighting should reduce the effects of the outer points anyway |
|
|
|
|
if (tid2 < 169) |
|
|
|
|
|
|
|
|
|
{ |
|
|
|
|
dx -= texLookups[threadIdx.x ][threadIdx.y ]; |
|
|
|
|
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; |
|
|
|
@ -835,24 +834,12 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
descriptor_base[threadIdx.x] = lookup / len; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features) |
|
|
|
|
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt, |
|
|
|
|
int xIndex, int yIndex, int tid) |
|
|
|
|
{ |
|
|
|
|
// get the interest point parameters (x, y, size, response, angle) |
|
|
|
|
__shared__ float ipt[5]; |
|
|
|
|
if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0) |
|
|
|
|
{ |
|
|
|
|
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float sin_theta, cos_theta; |
|
|
|
|
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta); |
|
|
|
|
|
|
|
|
|
// Compute sampling points |
|
|
|
|
// since grids are 2D, need to compute xBlock and yBlock indices |
|
|
|
|
const int xIndex = threadIdx.y * 5 + threadIdx.x % 5; |
|
|
|
|
const int yIndex = threadIdx.z * 5 + threadIdx.x / 5; |
|
|
|
|
|
|
|
|
|
// Compute rotated sampling points |
|
|
|
|
// (clockwise rotation since we are rotating the lattice) |
|
|
|
|
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel) |
|
|
|
@ -865,7 +852,6 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
// a b c |
|
|
|
|
// d f |
|
|
|
|
// g h i |
|
|
|
|
|
|
|
|
|
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); |
|
|
|
|
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]); |
|
|
|
|
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); |
|
|
|
@ -883,53 +869,64 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
|
|
|
|
|
// rotate responses (store all dxs then all dys) |
|
|
|
|
// - counterclockwise rotation to rotate back to zero orientation |
|
|
|
|
sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx |
|
|
|
|
sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy |
|
|
|
|
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx |
|
|
|
|
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__device__ void reduce_sum(float sdata1[4][4][25], float sdata2[4][4][25], float sdata3[4][4][25], |
|
|
|
|
float sdata4[4][4][25]) |
|
|
|
|
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features) |
|
|
|
|
{ |
|
|
|
|
// first step is to reduce from 25 to 16 |
|
|
|
|
if (threadIdx.x < 9) // use 9 threads |
|
|
|
|
// get the interest point parameters (x, y, size, response, angle) |
|
|
|
|
__shared__ float ipt[5]; |
|
|
|
|
if (threadIdx.x < 5 && threadIdx.y == 0) |
|
|
|
|
{ |
|
|
|
|
sdata1[threadIdx.z][threadIdx.y][threadIdx.x] += sdata1[threadIdx.z][threadIdx.y][threadIdx.x + 16]; |
|
|
|
|
sdata2[threadIdx.z][threadIdx.y][threadIdx.x] += sdata2[threadIdx.z][threadIdx.y][threadIdx.x + 16]; |
|
|
|
|
sdata3[threadIdx.z][threadIdx.y][threadIdx.x] += sdata3[threadIdx.z][threadIdx.y][threadIdx.x + 16]; |
|
|
|
|
sdata4[threadIdx.z][threadIdx.y][threadIdx.x] += sdata4[threadIdx.z][threadIdx.y][threadIdx.x + 16]; |
|
|
|
|
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) |
|
|
|
|
if (threadIdx.x < 16) |
|
|
|
|
{ |
|
|
|
|
volatile float* smem = sdata1[threadIdx.z][threadIdx.y]; |
|
|
|
|
|
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 8]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 4]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 2]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 1]; |
|
|
|
|
// Compute sampling points |
|
|
|
|
// since grids are 2D, need to compute xBlock and yBlock indices |
|
|
|
|
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4 |
|
|
|
|
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4) |
|
|
|
|
const int xIndex = (xBlock * 5) + (threadIdx.x % 5); |
|
|
|
|
const int yIndex = (yBlock * 5) + (threadIdx.x / 5); |
|
|
|
|
|
|
|
|
|
smem = sdata2[threadIdx.z][threadIdx.y]; |
|
|
|
|
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 8]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 4]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 2]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 1]; |
|
|
|
|
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, |
|
|
|
|
volatile float* sdata3, volatile float* sdata4, int tid) |
|
|
|
|
{ |
|
|
|
|
// first step is to reduce from 25 to 16 |
|
|
|
|
if (tid < 9) // use 9 threads |
|
|
|
|
{ |
|
|
|
|
sdata1[tid] += sdata1[tid + 16]; |
|
|
|
|
sdata2[tid] += sdata2[tid + 16]; |
|
|
|
|
sdata3[tid] += sdata3[tid + 16]; |
|
|
|
|
sdata4[tid] += sdata4[tid + 16]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
smem = sdata3[threadIdx.z][threadIdx.y]; |
|
|
|
|
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) |
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
sdata1[tid] += sdata1[tid + 8]; |
|
|
|
|
sdata1[tid] += sdata1[tid + 4]; |
|
|
|
|
sdata1[tid] += sdata1[tid + 2]; |
|
|
|
|
sdata1[tid] += sdata1[tid + 1]; |
|
|
|
|
|
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 8]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 4]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 2]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 1]; |
|
|
|
|
sdata2[tid] += sdata2[tid + 8]; |
|
|
|
|
sdata2[tid] += sdata2[tid + 4]; |
|
|
|
|
sdata2[tid] += sdata2[tid + 2]; |
|
|
|
|
sdata2[tid] += sdata2[tid + 1]; |
|
|
|
|
|
|
|
|
|
smem = sdata4[threadIdx.z][threadIdx.y]; |
|
|
|
|
sdata3[tid] += sdata3[tid + 8]; |
|
|
|
|
sdata3[tid] += sdata3[tid + 4]; |
|
|
|
|
sdata3[tid] += sdata3[tid + 2]; |
|
|
|
|
sdata3[tid] += sdata3[tid + 1]; |
|
|
|
|
|
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 8]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 4]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 2]; |
|
|
|
|
smem[threadIdx.x] += smem[threadIdx.x + 1]; |
|
|
|
|
sdata4[tid] += sdata4[tid + 8]; |
|
|
|
|
sdata4[tid] += sdata4[tid + 4]; |
|
|
|
|
sdata4[tid] += sdata4[tid + 2]; |
|
|
|
|
sdata4[tid] += sdata4[tid + 1]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -938,31 +935,43 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) |
|
|
|
|
{ |
|
|
|
|
// 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region) |
|
|
|
|
__shared__ float sdx[4][4][25]; |
|
|
|
|
__shared__ float sdy[4][4][25]; |
|
|
|
|
__shared__ float sdx [16 * 25]; |
|
|
|
|
__shared__ float sdy [16 * 25]; |
|
|
|
|
__shared__ float sdxabs[16 * 25]; |
|
|
|
|
__shared__ float sdyabs[16 * 25]; |
|
|
|
|
|
|
|
|
|
calc_dx_dy(sdx, sdy, features); |
|
|
|
|
__syncthreads(); |
|
|
|
|
__shared__ float sdesc[64]; |
|
|
|
|
|
|
|
|
|
__shared__ float sdxabs[4][4][25]; |
|
|
|
|
__shared__ float sdyabs[4][4][25]; |
|
|
|
|
float* sdx_bin = sdx + (threadIdx.y * 25); |
|
|
|
|
float* sdy_bin = sdy + (threadIdx.y * 25); |
|
|
|
|
float* sdxabs_bin = sdxabs + (threadIdx.y * 25); |
|
|
|
|
float* sdyabs_bin = sdyabs + (threadIdx.y * 25); |
|
|
|
|
|
|
|
|
|
sdxabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); // |dx| array |
|
|
|
|
sdyabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); // |dy| array |
|
|
|
|
calc_dx_dy(sdx_bin, sdy_bin, features); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum(sdx, sdy, sdxabs, sdyabs); |
|
|
|
|
sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array |
|
|
|
|
sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 16 + threadIdx.y * 4; |
|
|
|
|
reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float* sdesc_bin = sdesc + (threadIdx.y << 2); |
|
|
|
|
|
|
|
|
|
// write dx, dy, |dx|, |dy| |
|
|
|
|
if (threadIdx.x == 0) |
|
|
|
|
{ |
|
|
|
|
descriptors_block[0] = sdx[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[1] = sdy[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[2] = sdxabs[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[3] = sdyabs[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
sdesc_bin[0] = sdx_bin[0]; |
|
|
|
|
sdesc_bin[1] = sdy_bin[0]; |
|
|
|
|
sdesc_bin[2] = sdxabs_bin[0]; |
|
|
|
|
sdesc_bin[3] = sdyabs_bin[0]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
if (tid < 64) |
|
|
|
|
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Spawn 16 blocks per interest point |
|
|
|
@ -970,74 +979,90 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) |
|
|
|
|
{ |
|
|
|
|
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) |
|
|
|
|
__shared__ float sdx[4][4][25]; |
|
|
|
|
__shared__ float sdy[4][4][25]; |
|
|
|
|
|
|
|
|
|
calc_dx_dy(sdx, sdy, features); |
|
|
|
|
__syncthreads(); |
|
|
|
|
__shared__ float sdx[16 * 25]; |
|
|
|
|
__shared__ float sdy[16 * 25]; |
|
|
|
|
|
|
|
|
|
// sum (reduce) 5x5 area response |
|
|
|
|
__shared__ float sd1[4][4][25]; |
|
|
|
|
__shared__ float sd2[4][4][25]; |
|
|
|
|
__shared__ float sdabs1[4][4][25]; |
|
|
|
|
__shared__ float sdabs2[4][4][25]; |
|
|
|
|
__shared__ float sd1[16 * 25]; |
|
|
|
|
__shared__ float sd2[16 * 25]; |
|
|
|
|
__shared__ float sdabs1[16 * 25]; |
|
|
|
|
__shared__ float sdabs2[16 * 25]; |
|
|
|
|
|
|
|
|
|
__shared__ float sdesc[128]; |
|
|
|
|
|
|
|
|
|
float* sdx_bin = sdx + (threadIdx.y * 25); |
|
|
|
|
float* sdy_bin = sdy + (threadIdx.y * 25); |
|
|
|
|
float* sd1_bin = sd1 + (threadIdx.y * 25); |
|
|
|
|
float* sd2_bin = sd2 + (threadIdx.y * 25); |
|
|
|
|
float* sdabs1_bin = sdabs1 + (threadIdx.y * 25); |
|
|
|
|
float* sdabs2_bin = sdabs2 + (threadIdx.y * 25); |
|
|
|
|
|
|
|
|
|
calc_dx_dy(sdx_bin, sdy_bin, features); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
if (sdy[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) |
|
|
|
|
if (sdy_bin[threadIdx.x] >= 0) |
|
|
|
|
{ |
|
|
|
|
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; |
|
|
|
|
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); |
|
|
|
|
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x]; |
|
|
|
|
sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); |
|
|
|
|
sd2_bin[threadIdx.x] = 0; |
|
|
|
|
sdabs2_bin[threadIdx.x] = 0; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; |
|
|
|
|
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); |
|
|
|
|
sd1_bin[threadIdx.x] = 0; |
|
|
|
|
sdabs1_bin[threadIdx.x] = 0; |
|
|
|
|
sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x]; |
|
|
|
|
sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]); |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum(sd1, sd2, sdabs1, sdabs2); |
|
|
|
|
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 32 + threadIdx.y * 8; |
|
|
|
|
float* sdesc_bin = sdesc + (threadIdx.y << 3); |
|
|
|
|
|
|
|
|
|
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) |
|
|
|
|
if (threadIdx.x == 0) |
|
|
|
|
{ |
|
|
|
|
descriptors_block[0] = sd1[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[1] = sdabs1[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[2] = sd2[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[3] = sdabs2[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
sdesc_bin[0] = sd1_bin[0]; |
|
|
|
|
sdesc_bin[1] = sdabs1_bin[0]; |
|
|
|
|
sdesc_bin[2] = sd2_bin[0]; |
|
|
|
|
sdesc_bin[3] = sdabs2_bin[0]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
if (sdx[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) |
|
|
|
|
if (sdx_bin[threadIdx.x] >= 0) |
|
|
|
|
{ |
|
|
|
|
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; |
|
|
|
|
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); |
|
|
|
|
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x]; |
|
|
|
|
sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); |
|
|
|
|
sd2_bin[threadIdx.x] = 0; |
|
|
|
|
sdabs2_bin[threadIdx.x] = 0; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; |
|
|
|
|
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; |
|
|
|
|
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); |
|
|
|
|
sd1_bin[threadIdx.x] = 0; |
|
|
|
|
sdabs1_bin[threadIdx.x] = 0; |
|
|
|
|
sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x]; |
|
|
|
|
sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum(sd1, sd2, sdabs1, sdabs2); |
|
|
|
|
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) |
|
|
|
|
if (threadIdx.x == 0) |
|
|
|
|
{ |
|
|
|
|
descriptors_block[4] = sd1[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[5] = sdabs1[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[6] = sd2[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
descriptors_block[7] = sdabs2[threadIdx.z][threadIdx.y][0]; |
|
|
|
|
sdesc_bin[4] = sd1_bin[0]; |
|
|
|
|
sdesc_bin[5] = sdabs1_bin[0]; |
|
|
|
|
sdesc_bin[6] = sd2_bin[0]; |
|
|
|
|
sdesc_bin[7] = sdabs2_bin[0]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
if (tid < 128) |
|
|
|
|
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) |
|
|
|
@ -1046,7 +1071,7 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
|
|
|
|
|
if (descriptors.cols == 64) |
|
|
|
|
{ |
|
|
|
|
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features); |
|
|
|
|
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features); |
|
|
|
|
cudaSafeCall( cudaGetLastError() ); |
|
|
|
|
|
|
|
|
|
cudaSafeCall( cudaThreadSynchronize() ); |
|
|
|
@ -1058,7 +1083,7 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features); |
|
|
|
|
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features); |
|
|
|
|
cudaSafeCall( cudaGetLastError() ); |
|
|
|
|
|
|
|
|
|
cudaSafeCall( cudaThreadSynchronize() ); |
|
|
|
@ -1080,9 +1105,6 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float sin_theta, cos_theta; |
|
|
|
|
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta); |
|
|
|
|
|
|
|
|
|
// Compute sampling points |
|
|
|
|
// since grids are 2D, need to compute xBlock and yBlock indices |
|
|
|
|
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4 |
|
|
|
@ -1090,100 +1112,40 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
const int xIndex = xBlock * blockDim.x + threadIdx.x; |
|
|
|
|
const int yIndex = yBlock * blockDim.y + threadIdx.y; |
|
|
|
|
|
|
|
|
|
// Compute rotated sampling points |
|
|
|
|
// (clockwise rotation since we are rotating the lattice) |
|
|
|
|
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel) |
|
|
|
|
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] |
|
|
|
|
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]); |
|
|
|
|
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] |
|
|
|
|
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]); |
|
|
|
|
|
|
|
|
|
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy) |
|
|
|
|
// a b c |
|
|
|
|
// d f |
|
|
|
|
// g h i |
|
|
|
|
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); |
|
|
|
|
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]); |
|
|
|
|
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); |
|
|
|
|
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y); |
|
|
|
|
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y); |
|
|
|
|
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); |
|
|
|
|
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]); |
|
|
|
|
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); |
|
|
|
|
|
|
|
|
|
// compute axis-aligned HaarX, HaarY |
|
|
|
|
// (could group the additions together into multiplications) |
|
|
|
|
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular) |
|
|
|
|
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx |
|
|
|
|
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy |
|
|
|
|
|
|
|
|
|
// rotate responses (store all dxs then all dys) |
|
|
|
|
// - counterclockwise rotation to rotate back to zero orientation |
|
|
|
|
sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx |
|
|
|
|
sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__device__ void reduce_sum_old(float sdata[25], int tid) |
|
|
|
|
{ |
|
|
|
|
// first step is to reduce from 25 to 16 |
|
|
|
|
if (tid < 9) // use 9 threads |
|
|
|
|
sdata[tid] += sdata[tid + 16]; |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) |
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
volatile float* smem = sdata; |
|
|
|
|
|
|
|
|
|
smem[tid] += smem[tid + 8]; |
|
|
|
|
smem[tid] += smem[tid + 4]; |
|
|
|
|
smem[tid] += smem[tid + 2]; |
|
|
|
|
smem[tid] += smem[tid + 1]; |
|
|
|
|
} |
|
|
|
|
calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Spawn 16 blocks per interest point |
|
|
|
|
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location |
|
|
|
|
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features) |
|
|
|
|
{ |
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2); |
|
|
|
|
|
|
|
|
|
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) |
|
|
|
|
__shared__ float sdx[25]; |
|
|
|
|
__shared__ float sdy[25]; |
|
|
|
|
__shared__ float sdxabs[25]; |
|
|
|
|
__shared__ float sdyabs[25]; |
|
|
|
|
|
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
|
|
|
|
calc_dx_dy_old(sdx, sdy, features, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
__shared__ float sabs[25]; |
|
|
|
|
sdxabs[tid] = fabs(sdx[tid]); // |dx| array |
|
|
|
|
sdyabs[tid] = fabs(sdy[tid]); // |dy| array |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
sabs[tid] = fabs(sdx[tid]); // |dx| array |
|
|
|
|
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum_old(sdx, tid); |
|
|
|
|
reduce_sum_old(sdy, tid); |
|
|
|
|
reduce_sum_old(sabs, tid); |
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2); |
|
|
|
|
|
|
|
|
|
// write dx, dy, |dx| |
|
|
|
|
// write dx, dy, |dx|, |dy| |
|
|
|
|
if (tid == 0) |
|
|
|
|
{ |
|
|
|
|
descriptors_block[0] = sdx[0]; |
|
|
|
|
descriptors_block[1] = sdy[0]; |
|
|
|
|
descriptors_block[2] = sabs[0]; |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
sabs[tid] = fabs(sdy[tid]); // |dy| array |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum_old(sabs, tid); |
|
|
|
|
|
|
|
|
|
// write |dy| |
|
|
|
|
if (tid == 0) |
|
|
|
|
{ |
|
|
|
|
descriptors_block[3] = sabs[0]; |
|
|
|
|
descriptors_block[2] = sdxabs[0]; |
|
|
|
|
descriptors_block[3] = sdyabs[0]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -1191,23 +1153,21 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location |
|
|
|
|
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features) |
|
|
|
|
{ |
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); |
|
|
|
|
|
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
|
|
|
|
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) |
|
|
|
|
__shared__ float sdx[25]; |
|
|
|
|
__shared__ float sdy[25]; |
|
|
|
|
|
|
|
|
|
calc_dx_dy_old(sdx, sdy, features, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
// sum (reduce) 5x5 area response |
|
|
|
|
__shared__ float sd1[25]; |
|
|
|
|
__shared__ float sd2[25]; |
|
|
|
|
__shared__ float sdabs1[25]; |
|
|
|
|
__shared__ float sdabs2[25]; |
|
|
|
|
|
|
|
|
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
|
|
|
|
calc_dx_dy_old(sdx, sdy, features, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
if (sdy[tid] >= 0) |
|
|
|
|
{ |
|
|
|
|
sd1[tid] = sdx[tid]; |
|
|
|
@ -1224,10 +1184,10 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum_old(sd1, tid); |
|
|
|
|
reduce_sum_old(sd2, tid); |
|
|
|
|
reduce_sum_old(sdabs1, tid); |
|
|
|
|
reduce_sum_old(sdabs2, tid); |
|
|
|
|
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); |
|
|
|
|
|
|
|
|
|
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) |
|
|
|
|
if (tid == 0) |
|
|
|
@ -1255,10 +1215,8 @@ namespace cv { namespace gpu { namespace surf |
|
|
|
|
} |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
reduce_sum_old(sd1, tid); |
|
|
|
|
reduce_sum_old(sd2, tid); |
|
|
|
|
reduce_sum_old(sdabs1, tid); |
|
|
|
|
reduce_sum_old(sdabs2, tid); |
|
|
|
|
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); |
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) |
|
|
|
|
if (tid == 0) |
|
|
|
|