Fixed a bug in SURF_GPU (compute descriptors: guard the per-thread work with tid < 25)

pull/13383/head
Vladislav Vinogradov 14 years ago
parent 6259520aa1
commit 331062360d
  1. 132
      modules/gpu/src/cuda/surf.cu

@@ -831,22 +831,25 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
sdxabs[tid] = fabs(sdx[tid]); // |dx| array if (tid < 25)
sdyabs[tid] = fabs(sdy[tid]); // |dy| array {
__syncthreads(); sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
__syncthreads(); __syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2); float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// write dx, dy, |dx|, |dy| // write dx, dy, |dx|, |dy|
if (tid == 0) if (tid == 0)
{ {
descriptors_block[0] = sdx[0]; descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0]; descriptors_block[1] = sdy[0];
descriptors_block[2] = sdxabs[0]; descriptors_block[2] = sdxabs[0];
descriptors_block[3] = sdyabs[0]; descriptors_block[3] = sdyabs[0];
}
} }
} }
@@ -867,63 +870,66 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (sdy[tid] >= 0) if (tid < 25)
{
sd1[tid] = sdx[tid];
sdabs1[tid] = fabs(sdx[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
}
else
{ {
sd1[tid] = 0; if (sdy[tid] >= 0)
sdabs1[tid] = 0; {
sd2[tid] = sdx[tid]; sd1[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]); sdabs1[tid] = fabs(sdx[tid]);
} sd2[tid] = 0;
__syncthreads(); sdabs2[tid] = 0;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
__syncthreads();
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads(); __syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0) if (tid == 0)
{ {
descriptors_block[0] = sd1[0]; descriptors_block[0] = sd1[0];
descriptors_block[1] = sdabs1[0]; descriptors_block[1] = sdabs1[0];
descriptors_block[2] = sd2[0]; descriptors_block[2] = sd2[0];
descriptors_block[3] = sdabs2[0]; descriptors_block[3] = sdabs2[0];
} }
__syncthreads(); __syncthreads();
if (sdx[tid] >= 0) if (sdx[tid] >= 0)
{ {
sd1[tid] = sdy[tid]; sd1[tid] = sdy[tid];
sdabs1[tid] = fabs(sdy[tid]); sdabs1[tid] = fabs(sdy[tid]);
sd2[tid] = 0; sd2[tid] = 0;
sdabs2[tid] = 0; sdabs2[tid] = 0;
} }
else else
{ {
sd1[tid] = 0; sd1[tid] = 0;
sdabs1[tid] = 0; sdabs1[tid] = 0;
sd2[tid] = sdy[tid]; sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]); sdabs2[tid] = fabs(sdy[tid]);
} }
__syncthreads(); __syncthreads();
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads(); __syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0) if (tid == 0)
{ {
descriptors_block[4] = sd1[0]; descriptors_block[4] = sd1[0];
descriptors_block[5] = sdabs1[0]; descriptors_block[5] = sdabs1[0];
descriptors_block[6] = sd2[0]; descriptors_block[6] = sd2[0];
descriptors_block[7] = sdabs2[0]; descriptors_block[7] = sdabs2[0];
}
} }
} }

Loading…
Cancel
Save