minor optimization of SURF_GPU (reduce memory transfers, use structure of arrays instead of array of structures)

pull/13383/head
Vladislav Vinogradov 14 years ago
parent 145a76faf4
commit 0b19f915be
1. modules/gpu/doc/feature_detection_and_description.rst (20 changed lines)
2. modules/gpu/include/opencv2/gpu/gpu.hpp (23 changed lines)
3. modules/gpu/src/cuda/internal_shared.hpp (23 changed lines)
4. modules/gpu/src/cuda/surf.cu (142 changed lines)
5. modules/gpu/src/surf.cpp (151 changed lines)

@@ -15,6 +15,17 @@ This class is used for extracting Speeded Up Robust Features (SURF) from an imag
class SURF_GPU : public CvSURFParams
{
public:
enum KeypointLayout
{
SF_X = 0,
SF_Y,
SF_LAPLACIAN,
SF_SIZE,
SF_DIR,
SF_HESSIAN,
SF_FEATURE_STRIDE
};
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
@@ -67,22 +78,15 @@ This class is used for extracting Speeded Up Robust Features (SURF) from an imag
GpuMat det, trace;
GpuMat maxPosBuffer;
GpuMat featuresBuffer;
GpuMat keypointsBuffer;
};
The class ``SURF_GPU`` implements the Speeded Up Robust Features descriptor. It contains a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option), but the descriptors can also be computed for user-specified keypoints. Only 8-bit grayscale images are supported.
The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat`` . The ``keypoints`` matrix is a one-row matrix of the ``CV_32FC6`` type. It contains 6 float values per feature: ``x, y, laplacian, size, dir, hessian`` . The ``descriptors`` matrix is
:math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between the CPU and GPU versions ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat``. The ``keypoints`` matrix is a :math:`6 \times \texttt{nFeatures}` matrix with the ``CV_32FC1`` type, one row per field: ``keypoints.ptr<float>(SF_X)[i]`` contains the x coordinate of the i-th feature, ``keypoints.ptr<float>(SF_Y)[i]`` its y coordinate, ``keypoints.ptr<int>(SF_LAPLACIAN)[i]`` its laplacian sign, ``keypoints.ptr<float>(SF_SIZE)[i]`` its size, ``keypoints.ptr<float>(SF_DIR)[i]`` its orientation, and ``keypoints.ptr<float>(SF_HESSIAN)[i]`` its response. The ``descriptors`` matrix is a :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_GPU`` uses some buffers and provides access to them. All buffers can be safely released between function calls.
**Note:**
By default, for user-provided keypoints the class ``SURF_GPU`` recalculates the keypoints' orientation and returns a reordered/filtered keypoints array with the corresponding descriptors array.
See Also: :c:type:`SURF`
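A minimal host-side sketch of reading this layout (``kpGPU`` is a placeholder for a keypoints ``GpuMat`` produced by ``SURF_GPU``; the printout is illustrative only):

cv::Mat kp;
kpGPU.download(kp);  // 6 rows (SF_X .. SF_HESSIAN), one column per feature

const float* x   = kp.ptr<float>(cv::gpu::SURF_GPU::SF_X);
const float* y   = kp.ptr<float>(cv::gpu::SURF_GPU::SF_Y);
const float* dir = kp.ptr<float>(cv::gpu::SURF_GPU::SF_DIR);

for (int i = 0; i < kp.cols; ++i)
    printf("feature %d: (%.1f, %.1f), angle %.1f\n", i, x[i], y[i], dir[i]);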
.. index:: gpu::BruteForceMatcher_GPU

@@ -1566,6 +1566,17 @@ namespace cv
class CV_EXPORTS SURF_GPU : public CvSURFParams
{
public:
enum KeypointLayout
{
SF_X = 0,
SF_Y,
SF_LAPLACIAN,
SF_SIZE,
SF_DIR,
SF_HESSIAN,
SF_FEATURE_STRIDE
};
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
@@ -1585,9 +1596,13 @@ namespace cv
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have 1 row and type CV_32FC(6)
//! keypoints.at<float[6]>(1, i) contains i'th keypoint
//! format: (x, y, laplacian, size, dir, hessian)
//! keypoints will have nFeatures cols and 6 rows
//! keypoints.ptr<float>(SF_X)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(SF_Y)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<int>(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(SF_SIZE)[i] will contain size of i'th feature
//! keypoints.ptr<float>(SF_DIR)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(SF_HESSIAN)[i] will contain response of i'th feature
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
@@ -1611,8 +1626,6 @@ namespace cv
GpuMat det, trace;
GpuMat maxPosBuffer;
GpuMat featuresBuffer;
GpuMat keypointsBuffer;
};
}
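A hedged usage sketch of the updated interface (the file name and variable names are placeholders, not part of this patch):

#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <vector>

int main()
{
    cv::gpu::SURF_GPU surf;
    cv::gpu::GpuMat img, keypoints, descriptors;
    img.upload(cv::imread("scene.png", 0));               // CV_8UC1 input only

    surf(img, cv::gpu::GpuMat(), keypoints, descriptors); // detect + describe

    std::vector<cv::KeyPoint> kp;
    surf.downloadKeypoints(keypoints, kp);                // SoA GpuMat -> cv::KeyPoint
    return 0;
}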

@@ -105,28 +105,7 @@ namespace cv
const textureReference* tex;
cudaSafeCall( cudaGetTextureReference(&tex, name) );
cudaSafeCall( cudaUnbindTexture(tex) );
}
struct KeyPoint_GPU
{
float x;
float y;
float laplacian;
float size;
float dir;
float hessian;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_LAPLACIAN,
SF_SIZE,
SF_DIR,
SF_HESSIAN,
SF_FEATURE_STRIDE
};
}
}
}
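For reference, why ``KeyPoint_GPU`` was dropped: with an array of 24-byte structures, a warp in which each thread reads the same field touches addresses 24 bytes apart, so the loads cannot coalesce; with one plain array per field, consecutive threads read consecutive floats. A toy sketch of the two access patterns (illustrative names only, not patch code):

struct KeyPointAoS { float x, y, laplacian, size, dir, hessian; }; // 24 bytes

__global__ void readX_AoS(const KeyPointAoS* pts, float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = pts[i].x; // 24-byte stride across the warp: several transactions
}

__global__ void readX_SoA(const float* x, float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = x[i];     // consecutive 4-byte loads: one coalesced transaction
}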

@@ -63,8 +63,6 @@ namespace cv { namespace gpu { namespace surf
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The maximum number of keypoints that memory is reserved for.
__constant__ int c_max_keypoints;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
@@ -346,7 +344,9 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
__global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -357,7 +357,6 @@ namespace cv { namespace gpu { namespace surf
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float N9[3][3][3];
__shared__ KeyPoint_GPU p;
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
__syncthreads();
@@ -422,33 +421,46 @@ namespace cv { namespace gpu { namespace surf
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
{
// if the step is within the interpolation region, perform it
const int size = calcSize(c_octave, maxPos.z);
// Get a new feature index.
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
if (ind < c_max_features)
{
const int size = calcSize(c_octave, maxPos.z);
const float px = center_j + x[0] * (1 << c_octave);
const float py = center_i + x[1] * (1 << c_octave);
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
const int ds = size - calcSize(c_octave, maxPos.z - 1);
const float psize = roundf(size + x[2] * ds);
p.x = center_j + x[0] * (1 << c_octave);
p.y = center_i + x[1] * (1 << c_octave);
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = psize * 1.2f / 9.0f;
int ds = size - calcSize(c_octave, maxPos.z - 1);
p.size = roundf(size + x[2] * ds);
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
p.laplacian = maxPos.w;
p.dir = 0.0f;
p.hessian = N9[1][1][1];
// check when grad_wav_size is too big
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
// Get a new feature index.
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
// Should we split up this transfer over many threads?
featuresBuffer[ind] = p;
}
if (ind < c_max_features)
{
featureX[ind] = px;
featureY[ind] = py;
featureLaplacian[ind] = maxPos.w;
featureSize[ind] = psize;
featureHessian[ind] = N9[1][1][1];
}
} // grad_wav_size check
} // If the subpixel interpolation worked
}
} // If this is thread 0.
@@ -456,7 +468,9 @@ namespace cv { namespace gpu { namespace surf
#endif
}
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
dim3 threads;
threads.x = 3;
@@ -466,7 +480,7 @@ namespace cv { namespace gpu { namespace surf
dim3 grid;
grid.x = maxCounter;
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
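The counter idiom used by the kernel, isolated as a sketch (hypothetical helper, not patch code): ``atomicInc`` with a wrap value of ``(unsigned int)-1`` behaves as a plain increment and hands each accepted candidate a unique slot; writes past the reserved capacity are dropped, and the host later clamps the downloaded counter the same way:

__device__ void pushFeature(unsigned int* counter, unsigned int capacity,
                            float* xs, float* ys, float x, float y)
{
    const unsigned int ind = atomicInc(counter, (unsigned int)-1); // reserve slot
    if (ind < capacity) // the counter itself may run past the capacity
    {
        xs[ind] = x; // every SoA plane is written at the same reserved index
        ys[ind] = y;
    }
}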
@@ -486,7 +500,7 @@ namespace cv { namespace gpu { namespace surf
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -497,16 +511,9 @@ namespace cv { namespace gpu { namespace surf
__shared__ float s_sumx[64 * 4];
__shared__ float s_sumy[64 * 4];
__shared__ float s_feature[6];
if (threadIdx.x < 6 && threadIdx.y == 0)
s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];
__syncthreads();
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
@@ -526,8 +533,8 @@ namespace cv { namespace gpu { namespace surf
if (tid < ORI_SAMPLES)
{
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
{
@@ -646,26 +653,12 @@ namespace cv { namespace gpu { namespace surf
if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
{
// Get a new feature index.
unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);
float kp_dir = atan2f(besty, bestx);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI;
kp_dir *= 180.0f / CV_PI;
if (ind < c_max_keypoints)
{
float kp_dir = atan2f(besty, bestx);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI;
kp_dir *= 180.0f / CV_PI;
__shared__ KeyPoint_GPU kp;
kp.x = s_feature[SF_X];
kp.y = s_feature[SF_Y];
kp.laplacian = s_feature[SF_LAPLACIAN];
kp.size = s_feature[SF_SIZE];
kp.dir = kp_dir;
kp.hessian = s_feature[SF_HESSIAN];
keypoints[ind] = kp;
}
featureDir[blockIdx.x] = kp_dir;
}
}
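The value stored in ``featureDir`` is the dominant-gradient angle mapped to degrees in [0, 360); the mapping, restated standalone for reference (with explicit float casts of ``CV_PI``):

float kp_dir = atan2f(besty, bestx);      // radians in (-pi, pi]
if (kp_dir < 0)
    kp_dir += 2.0f * (float)CV_PI;        // wrap into [0, 2*pi)
kp_dir *= 180.0f / (float)CV_PI;          // degrees in [0, 360)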
@@ -676,7 +669,7 @@ namespace cv { namespace gpu { namespace surf
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
dim3 threads;
threads.x = 64;
@@ -685,7 +678,7 @@ namespace cv { namespace gpu { namespace surf
dim3 grid;
grid.x = nFeatures;
icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter);
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -748,20 +741,16 @@ namespace cv { namespace gpu { namespace surf
return saturate_cast<unsigned char>(res);
}
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
int tid)
{
__shared__ float s_PATCH[6][6];
// get the interest point parameters (x, y, size, response, angle)
__shared__ float s_pt[5];
if (threadIdx.y == 0)
s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x];
__syncthreads();
const float centerX = s_pt[SF_X];
const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
const float centerX = featureX[blockIdx.x];
const float centerY = featureY[blockIdx.x];
const float size = featureSize[blockIdx.x];
const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180);
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
@@ -838,7 +827,7 @@ namespace cv { namespace gpu { namespace surf
}
}
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
@@ -848,7 +837,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(sdx, sdy, features, tid);
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
__syncthreads();
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
@@ -870,7 +859,7 @@ namespace cv { namespace gpu { namespace surf
}
}
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
@@ -884,7 +873,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(sdx, sdy, features, tid);
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
__syncthreads();
if (sdy[tid] >= 0)
@@ -990,13 +979,14 @@ namespace cv { namespace gpu { namespace surf
descriptor_base[threadIdx.x] = lookup / len;
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
void compute_descriptors_gpu(const DevMem2Df& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -1008,7 +998,7 @@ namespace cv { namespace gpu { namespace surf
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
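The comment above notes that descriptors are computed unnormalized and then normalized; a sketch of that second step for the 64-float case, assuming one 64-thread block per dense descriptor row (illustrative only, not the actual normalization kernel from surf.cu):

__global__ void normalizeDescriptor64(float* descriptors) // one block per row
{
    __shared__ float sq[64];
    float* desc = descriptors + blockIdx.x * 64;

    const float v = desc[threadIdx.x];
    sq[threadIdx.x] = v * v;
    __syncthreads();

    for (int s = 32; s > 0; s >>= 1) // tree reduction of squared components
    {
        if (threadIdx.x < s)
            sq[threadIdx.x] += sq[threadIdx.x + s];
        __syncthreads();
    }

    desc[threadIdx.x] = v * rsqrtf(sq[0]); // divide by the L2 norm
}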

@@ -69,11 +69,14 @@ namespace cv { namespace gpu { namespace surf
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu(const DevMem2Df& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}}}
using namespace cv::gpu::surf;
@@ -88,7 +91,7 @@ namespace
sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace),
maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer),
maxPosBuffer(surf.maxPosBuffer),
img_cols(img.cols), img_rows(img.rows),
@@ -101,18 +104,16 @@ namespace
CV_Assert(nOctaves > 0 && nOctaveLayers > 0);
CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
maxKeypoints = min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
maxFeatures = min(static_cast<int>(1.5 * maxKeypoints), 65535);
maxFeatures = min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
maxCandidates = min(static_cast<int>(1.5 * maxFeatures), 65535);
CV_Assert(maxKeypoints > 0);
CV_Assert(maxFeatures > 0);
cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) );
cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 1) * sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 1) * sizeof(unsigned int)) );
uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
uploadConstant("cv::gpu::surf::c_max_keypoints", maxKeypoints);
uploadConstant("cv::gpu::surf::c_img_rows", img_rows);
uploadConstant("cv::gpu::surf::c_img_cols", img_cols);
uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
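A worked example of the new sizing, assuming a 640x480 input and a ``keypointsRatio`` of 0.01 (the value is an assumption for illustration):

int maxFeatures   = std::min(static_cast<int>(640 * 480 * 0.01), 65535);  // = 3072
int maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535); // = 4608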
@@ -148,7 +149,8 @@ namespace
ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace);
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer);
ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer);
ensureSizeIsEnough(SURF_GPU::SF_FEATURE_STRIDE, maxFeatures, CV_32FC1, keypoints);
keypoints.setTo(Scalar::all(0));
for (int octave = 0; octave < nOctaves; ++octave)
{
@@ -161,60 +163,49 @@ namespace
icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 2 + octave,
icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 1 + octave,
img_rows, img_cols, octave, use_mask, nOctaveLayers);
unsigned int maxCounter;
cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
if (maxCounter > 0)
{
icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), d_counters);
keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
keypoints.ptr<int>(SURF_GPU::SF_LAPLACIAN), keypoints.ptr<float>(SURF_GPU::SF_SIZE),
keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), d_counters);
}
}
unsigned int featureCounter;
cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
keypoints.cols = featureCounter;
if (!upright)
findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints);
else
{
if (featureCounter > 0)
featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
else
keypoints.release();
}
findOrientation(keypoints);
}
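The readback pattern above, as a standalone sketch: one ``unsigned int`` is copied back per synchronization point, clamped against the reserved capacity (``atomicInc`` may have run past it), and the keypoints matrix is narrowed by adjusting ``cols``, so the SoA row pointers stay valid:

unsigned int n;
cudaSafeCall( cudaMemcpy(&n, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
n = std::min(n, static_cast<unsigned int>(maxFeatures)); // clamp a possible overrun
keypoints.cols = n; // narrow the view; the row stride is unchanged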
void findOrientation(const GpuMat& features, GpuMat& keypoints)
void findOrientation(GpuMat& keypoints)
{
if (features.cols > 0)
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer);
icvCalcOrientation_gpu(features.ptr<KeyPoint_GPU>(), features.cols, keypointsBuffer.ptr<KeyPoint_GPU>(),
d_counters + 1);
unsigned int keypointsCounter;
cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
keypointsCounter = std::min(keypointsCounter, static_cast<unsigned int>(maxKeypoints));
if (keypointsCounter > 0)
keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints);
else
keypoints.release();
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
keypoints.ptr<float>(SURF_GPU::SF_SIZE), keypoints.ptr<float>(SURF_GPU::SF_DIR), nFeatures);
}
}
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
if (keypoints.cols > 0)
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
descriptors.create(keypoints.cols, descriptorSize, CV_32F);
compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
descriptors.create(nFeatures, descriptorSize, CV_32F);
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
keypoints.ptr<float>(SURF_GPU::SF_SIZE), keypoints.ptr<float>(SURF_GPU::SF_DIR), nFeatures);
}
}
@@ -228,8 +219,6 @@ namespace
GpuMat& trace;
GpuMat& maxPosBuffer;
GpuMat& featuresBuffer;
GpuMat& keypointsBuffer;
int img_cols, img_rows;
@@ -239,7 +228,6 @@ namespace
int maxCandidates;
int maxFeatures;
int maxKeypoints;
unsigned int* d_counters;
};
@@ -276,22 +264,24 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
keypointsGPU.release();
else
{
Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
Mat keypointsCPU(SURF_GPU::SF_FEATURE_STRIDE, keypoints.size(), CV_32FC1);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::SF_X);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::SF_Y);
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::SF_LAPLACIAN);
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SF_SIZE);
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::SF_DIR);
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::SF_HESSIAN);
for (size_t i = 0; i < keypoints.size(); ++i)
for (size_t i = 0, size = keypoints.size(); i < size; ++i)
{
const KeyPoint& kp = keypoints[i];
KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
gkp.x = kp.pt.x;
gkp.y = kp.pt.y;
gkp.laplacian = 1.0f;
gkp.size = kp.size;
gkp.dir = kp.angle;
gkp.hessian = kp.response;
kp_x[i] = kp.pt.x;
kp_y[i] = kp.pt.y;
kp_size[i] = kp.size;
kp_dir[i] = kp.angle;
kp_hessian[i] = kp.response;
kp_laplacian[i] = 1;
}
keypointsGPU.upload(keypointsCPU);
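A hedged round-trip sketch for user-provided keypoints (the coordinates are arbitrary; ``surf`` and ``img`` as in the earlier sketch; note that orientation is recomputed by default):

std::vector<cv::KeyPoint> cpuKp;
cpuKp.push_back(cv::KeyPoint(100.f, 120.f, 20.f)); // x, y, size

cv::gpu::GpuMat gpuKp, descriptors;
surf.uploadKeypoints(cpuKp, gpuKp);                     // pack into the 6-row SoA layout
surf(img, cv::gpu::GpuMat(), gpuKp, descriptors, true); // descriptors for given points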
@@ -314,7 +304,7 @@ namespace
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params)
int getPointOctave(float size, const CvSURFParams& params)
{
int best_octave = 0;
float min_diff = numeric_limits<float>::max();
@@ -322,7 +312,7 @@ namespace
{
for (int layer = 0; layer < params.nOctaveLayers; ++layer)
{
float diff = std::abs(kpt.size - (float)calcSize(octave, layer));
float diff = std::abs(size - (float)calcSize(octave, layer));
if (min_diff > diff)
{
min_diff = diff;
@@ -338,32 +328,35 @@ namespace
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
{
if (keypointsGPU.empty())
const int nFeatures = keypointsGPU.cols;
if (nFeatures == 0)
keypoints.clear();
else
{
CV_Assert(keypointsGPU.type() == CV_32FC(6) && keypointsGPU.isContinuous());
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE);
Mat keypointsCPU = keypointsGPU;
keypoints.resize(keypointsGPU.cols);
keypoints.resize(nFeatures);
for (int i = 0; i < keypointsGPU.cols; ++i)
float* kp_x = keypointsCPU.ptr<float>(SF_X);
float* kp_y = keypointsCPU.ptr<float>(SF_Y);
int* kp_laplacian = keypointsCPU.ptr<int>(SF_LAPLACIAN);
float* kp_size = keypointsCPU.ptr<float>(SF_SIZE);
float* kp_dir = keypointsCPU.ptr<float>(SF_DIR);
float* kp_hessian = keypointsCPU.ptr<float>(SF_HESSIAN);
for (int i = 0; i < nFeatures; ++i)
{
KeyPoint& kp = keypoints[i];
const KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
kp.pt.x = gkp.x;
kp.pt.y = gkp.y;
kp.size = gkp.size;
kp.angle = gkp.dir;
kp.response = gkp.hessian;
kp.octave = getPointOctave(gkp, *this);
kp.class_id = static_cast<int>(gkp.laplacian);
kp.pt.x = kp_x[i];
kp.pt.y = kp_y[i];
kp.class_id = kp_laplacian[i];
kp.size = kp_size[i];
kp.angle = kp_dir[i];
kp.response = kp_hessian[i];
kp.octave = getPointOctave(kp.size, *this);
}
}
}
@@ -403,9 +396,7 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
surf.detectKeypoints(keypoints);
else if (!upright)
{
GpuMat keypointsBuf;
surf.findOrientation(keypoints, keypointsBuf);
keypointsBuf.copyTo(keypoints);
surf.findOrientation(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
