diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 4d117c0339..a7f49e8f42 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -500,6 +500,20 @@ namespace cv { namespace gpu { namespace surf __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; + __device__ void reduceSum32(volatile float* v_sum, float& sum) + { + v_sum[threadIdx.x] = sum; + + if (threadIdx.x < 16) + { + v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16]; + v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8]; + v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4]; + v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2]; + v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1]; + } + } + __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 @@ -508,8 +522,7 @@ namespace cv { namespace gpu { namespace surf __shared__ float s_Y[128]; __shared__ float s_angle[128]; - __shared__ float s_sumx[64 * 4]; - __shared__ float s_sumy[64 * 4]; + __shared__ float s_sum[32 * 4]; /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -525,35 +538,30 @@ namespace cv { namespace gpu { namespace surf if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) { // Calc X, Y, angle and store it to shared memory - { - const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; - float X = 0.0f, Y = 0.0f, angle = 0.0f; + float X = 0.0f, Y = 0.0f, angle = 0.0f; - if (tid < ORI_SAMPLES) - { - const float margin = (float)(grad_wav_size - 1) / 2.0f; - const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin); - const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin); + if (tid < ORI_SAMPLES) + { + const float margin = (float)(grad_wav_size - 1) / 2.0f; + const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin); + const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin); - if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size)) - { - X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x); - Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x); - - angle = atan2f(Y, X); - if (angle < 0) - angle += 2.0f * CV_PI; - angle *= 180.0f / CV_PI; - } - } - if (tid < 128) + if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size)) { - s_X[tid] = X; - s_Y[tid] = Y; - s_angle[tid] = angle; + X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x); + Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x); + + angle = atan2f(Y, X); + if (angle < 0) + angle += 2.0f * CV_PI; + angle *= 180.0f / CV_PI; } } + s_X[tid] = X; + s_Y[tid] = Y; + s_angle[tid] = angle; __syncthreads(); float bestx = 0, besty = 0, best_mod = 0; @@ -570,43 +578,29 @@ namespace cv { namespace gpu { namespace surf sumx = s_X[threadIdx.x]; sumy = s_Y[threadIdx.x]; } + d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) + { + sumx += s_X[threadIdx.x + 32]; + sumy += s_Y[threadIdx.x + 32]; + } d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir); if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) { sumx += s_X[threadIdx.x + 64]; sumy += s_Y[threadIdx.x + 64]; } - - float* s_sumx_row = s_sumx + threadIdx.y * 64; - float* s_sumy_row = s_sumy + threadIdx.y * 64; - - s_sumx_row[threadIdx.x] = sumx; - s_sumy_row[threadIdx.x] = sumy; - __syncthreads(); - - if (threadIdx.x < 32) + d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) { - volatile float* v_sumx_row = s_sumx_row; - volatile float* v_sumy_row = s_sumy_row; - - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32]; - - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16]; - - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8]; - - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4]; + sumx += s_X[threadIdx.x + 96]; + sumy += s_Y[threadIdx.x + 96]; + } - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2]; + float* s_sum_row = s_sum + threadIdx.y * 32; - v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1]; - v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1]; - } + reduceSum32(s_sum_row, sumx); + reduceSum32(s_sum_row, sumy); const float temp_mod = sumx * sumx + sumy * sumy; if (temp_mod > best_mod) @@ -615,7 +609,6 @@ namespace cv { namespace gpu { namespace surf bestx = sumx; besty = sumy; } - __syncthreads(); } if (threadIdx.x == 0) @@ -672,7 +665,7 @@ namespace cv { namespace gpu { namespace surf void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) { dim3 threads; - threads.x = 64; + threads.x = 32; threads.y = 4; dim3 grid; @@ -742,8 +735,7 @@ namespace cv { namespace gpu { namespace surf } __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], - const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, - int tid) + const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) { __shared__ float s_PATCH[6][6]; @@ -778,7 +770,7 @@ namespace cv { namespace gpu { namespace surf if (threadIdx.x < 5 && threadIdx.y < 5) { - tid = threadIdx.y * 5 + threadIdx.x; + const int tid = threadIdx.y * 5 + threadIdx.x; const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; @@ -834,11 +826,11 @@ namespace cv { namespace gpu { namespace surf __shared__ float sdxabs[25]; __shared__ float sdyabs[25]; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid); + calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir); __syncthreads(); + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdyabs[tid] = fabs(sdy[tid]); // |dy| array __syncthreads(); @@ -870,11 +862,11 @@ namespace cv { namespace gpu { namespace surf __shared__ float sdabs1[25]; __shared__ float sdabs2[25]; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid); + calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir); __syncthreads(); + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + if (sdy[tid] >= 0) { sd1[tid] = sdx[tid];