SURF kind of works (let's see if the tests pass)

pull/2281/head
Vadim Pisarevsky 11 years ago
parent 8d5e952263
commit c18d1ee2a9
  1. 2
      modules/core/include/opencv2/core/ocl.hpp
  2. 475
      modules/nonfree/src/opencl/surf.cl
  3. 2
      modules/nonfree/src/surf.cpp
  4. 16
      modules/nonfree/src/surf.hpp
  5. 176
      modules/nonfree/src/surf.ocl.cpp

@ -585,7 +585,7 @@ class CV_EXPORTS Image2D
{ {
public: public:
Image2D(); Image2D();
Image2D(const UMat &src); explicit Image2D(const UMat &src);
~Image2D(); ~Image2D();
void* ptr() const; void* ptr() const;

@ -52,35 +52,52 @@
#define ORI_LOCAL_SIZE (360 / ORI_SEARCH_INC) #define ORI_LOCAL_SIZE (360 / ORI_SEARCH_INC)
// specialized for non-image2d_t supported platform, intel HD4000, for example // specialized for non-image2d_t supported platform, intel HD4000, for example
#ifdef DISABLE_IMAGE2D #ifndef HAVE_IMAGE2D
#define IMAGE_INT32 __global uint * __inline uint read_sumTex_(__global uint* sumTex, int sum_step, int img_rows, int img_cols, int2 coord)
#define IMAGE_INT8 __global uchar * {
#else int x = clamp(coord.x, 0, img_cols);
#define IMAGE_INT32 image2d_t int y = clamp(coord.y, 0, img_rows);
#define IMAGE_INT8 image2d_t return sumTex[sum_step * y + x];
#endif }
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow) __inline uchar read_imgTex_(__global uchar* imgTex, int img_step, int img_rows, int img_cols, float2 coord)
{ {
#ifdef DISABLE_IMAGE2D int x = clamp(convert_int_rte(coord.x), 0, img_cols-1);
int x = clamp(coord.x, 0, cols); int y = clamp(convert_int_rte(coord.y), 0, img_rows-1);
int y = clamp(coord.y, 0, rows); return imgTex[img_step * y + x];
return img[elemPerRow * y + x]; }
#define read_sumTex(coord) read_sumTex_(sumTex, sum_step, img_rows, img_cols, coord)
#define read_imgTex(coord) read_imgTex_(imgTex, img_step, img_rows, img_cols, coord)
#define __PARAM_sumTex__ __global uint* sumTex, int sum_step, int sum_offset
#define __PARAM_imgTex__ __global uchar* imgTex, int img_step, int img_offset
#define __PASS_sumTex__ sumTex, sum_step, sum_offset
#define __PASS_imgTex__ imgTex, img_step, img_offset
#else #else
return read_imageui(img, sam, coord).x; __inline uint read_sumTex_(image2d_t sumTex, sampler_t sam, int2 coord)
#endif {
return read_imageui(sumTex, sam, coord).x;
} }
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
__inline uchar read_imgTex_(image2d_t imgTex, sampler_t sam, float2 coord)
{ {
#ifdef DISABLE_IMAGE2D return (uchar)read_imageui(imgTex, sam, coord).x;
int x = clamp(round(coord.x), 0, cols - 1);
int y = clamp(round(coord.y), 0, rows - 1);
return img[elemPerRow * y + x];
#else
return (uchar)read_imageui(img, sam, coord).x;
#endif
} }
#define read_sumTex(coord) read_sumTex_(sumTex, sampler, coord)
#define read_imgTex(coord) read_imgTex_(imgTex, sampler, coord)
#define __PARAM_sumTex__ image2d_t sumTex
#define __PARAM_imgTex__ image2d_t imgTex
#define __PASS_sumTex__ sumTex
#define __PASS_imgTex__ imgTex
#endif
// dynamically change the precision used for floating type // dynamically change the precision used for floating type
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
@ -95,7 +112,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col
#endif #endif
// Image read mode // Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
#ifndef FLT_EPSILON #ifndef FLT_EPSILON
#define FLT_EPSILON (1e-15) #define FLT_EPSILON (1e-15)
@ -105,45 +122,6 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
#define CV_PI_F 3.14159265f #define CV_PI_F 3.14159265f
#endif #endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float2 *src,
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
F d = 0;
int2 dx1 = convert_int2(round(ratio * src[0]));
int2 dy1 = convert_int2(round(ratio * src[1]));
int2 dx2 = convert_int2(round(ratio * src[2]));
int2 dy2 = convert_int2(round(ratio * src[3]));
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
return (float)d;
}
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Hessian // Hessian
@ -182,22 +160,20 @@ F calcAxisAlignedDerivative(
//calculate targeted layer per-pixel determinant and trace with an integral image //calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void SURF_calcLayerDetAndTrace( __kernel void SURF_calcLayerDetAndTrace(
IMAGE_INT32 sumTex, // input integral image __PARAM_sumTex__, // input integral image
__global float * det, // output Determinant int img_rows, int img_cols,
int c_nOctaveLayers, int c_octave, int c_layer_rows,
__global float * det, // output determinant
int det_step, int det_offset,
__global float * trace, // output trace __global float * trace, // output trace
int det_step, // the step of det in bytes int trace_step, int trace_offset)
int trace_step, // the step of trace in bytes
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows,
int sumTex_step
)
{ {
det_step /= sizeof(*det); det_step /= sizeof(*det);
trace_step /= sizeof(*trace); trace_step /= sizeof(*trace);
sumTex_step/= sizeof(uint); #ifndef HAVE_IMAGE2D
sum_step/= sizeof(uint);
#endif
// Determine the indices // Determine the indices
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
const int blockIdx_y = get_group_id(1) % gridDim_y; const int blockIdx_y = get_group_id(1) % gridDim_y;
@ -209,13 +185,13 @@ __kernel void SURF_calcLayerDetAndTrace(
const int size = calcSize(c_octave, layer); const int size = calcSize(c_octave, layer);
const int samples_i = 1 + ((c_img_rows - size) >> c_octave); const int samples_i = 1 + ((img_rows - size) >> c_octave);
const int samples_j = 1 + ((c_img_cols - size) >> c_octave); const int samples_j = 1 + ((img_cols - size) >> c_octave);
// Ignore pixels where some of the kernel is outside the image // Ignore pixels where some of the kernel is outside the image
const int margin = (size >> 1) >> c_octave; const int margin = (size >> 1) >> c_octave;
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) if (size <= img_rows && size <= img_cols && i < samples_i && j < samples_j)
{ {
int x = j << c_octave; int x = j << c_octave;
int y = i << c_octave; int y = i << c_octave;
@ -239,14 +215,14 @@ __kernel void SURF_calcLayerDetAndTrace(
{ {
// Some of the pixels needed to compute the derivative are // Some of the pixels needed to compute the derivative are
// repeated, so we only don't duplicate the fetch here. // repeated, so we only don't duplicate the fetch here.
int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t02 = read_sumTex( (int2)(x, y + r2));
int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t07 = read_sumTex( (int2)(x, y + r7));
int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t32 = read_sumTex( (int2)(x + r3, y + r2));
int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t37 = read_sumTex( (int2)(x + r3, y + r7));
int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t62 = read_sumTex( (int2)(x + r6, y + r2));
int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t67 = read_sumTex( (int2)(x + r6, y + r7));
int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t92 = read_sumTex( (int2)(x + r9, y + r2));
int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t97 = read_sumTex( (int2)(x + r9, y + r7));
d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
t62, t67, t92, t97, (r9 - r6) * (r7 - r2), t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
@ -259,14 +235,14 @@ __kernel void SURF_calcLayerDetAndTrace(
{ {
// Some of the pixels needed to compute the derivative are // Some of the pixels needed to compute the derivative are
// repeated, so we only don't duplicate the fetch here. // repeated, so we only don't duplicate the fetch here.
int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); int t20 = read_sumTex( (int2)(x + r2, y) );
int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); int t23 = read_sumTex( (int2)(x + r2, y + r3) );
int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); int t70 = read_sumTex( (int2)(x + r7, y) );
int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); int t73 = read_sumTex( (int2)(x + r7, y + r3) );
int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); int t26 = read_sumTex( (int2)(x + r2, y + r6) );
int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); int t76 = read_sumTex( (int2)(x + r7, y + r6) );
int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); int t29 = read_sumTex( (int2)(x + r2, y + r9) );
int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); int t79 = read_sumTex( (int2)(x + r7, y + r9) );
d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
t26, t29, t76, t79, (r7 - r2) * (r9 - r6), t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
@ -280,31 +256,31 @@ __kernel void SURF_calcLayerDetAndTrace(
// There's no saving us here, we just have to get all of the pixels in // There's no saving us here, we just have to get all of the pixels in
// separate fetches // separate fetches
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r1, y + r1) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r1, y + r4) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r4, y + r1) );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r4, y + r4) );
d += t / ((r4 - r1) * (r4 - r1)); d += t / ((r4 - r1) * (r4 - r1));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r5, y + r1) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r5, y + r4) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r8, y + r1) );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r8, y + r4) );
d -= t / ((r8 - r5) * (r4 - r1)); d -= t / ((r8 - r5) * (r4 - r1));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r1, y + r5) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r1, y + r8) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r4, y + r5) );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r4, y + r8) );
d -= t / ((r4 - r1) * (r8 - r5)); d -= t / ((r4 - r1) * (r8 - r5));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r5, y + r5) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r5, y + r8) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r8, y + r5) );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r8, y + r8) );
d += t / ((r8 - r5) * (r8 - r5)); d += t / ((r8 - r5) * (r8 - r5));
} }
const float dxy = (float)d; const float dxy = (float)d;
@ -317,171 +293,17 @@ __kernel void SURF_calcLayerDetAndTrace(
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// NONMAX // NONMAX
__constant float c_DM[5] = {0, 0, 9, 9, 1};
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
{
float ratio = (float)size / 9.0f;
float d = 0;
int dx1 = round(ratio * c_DM[0]);
int dy1 = round(ratio * c_DM[1]);
int dx2 = round(ratio * c_DM[2]);
int dy2 = round(ratio * c_DM[3]);
float t = 0;
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void SURF_findMaximaInLayerWithMask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows,
int c_layer_cols,
int c_max_candidates,
float c_hessianThreshold,
IMAGE_INT32 maskSumTex,
int mask_step
)
{
volatile __local float N9[768]; // threads.x * threads.y * 3
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
maxCounter += counter_offset;
mask_step /= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
const int blockIdx_y = get_group_id(1) % gridDim_y;
const int blockIdx_z = get_group_id(1) / gridDim_y;
const int layer = blockIdx_z + 1;
const int size = calcSize(c_octave, layer);
// Ignore pixels without a 3x3x3 neighbourhood in the layer above
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
float val0 = N9[localLin];
if (val0 > c_hessianThreshold)
{
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
&& val0 > N9[localLin - get_local_size(0) - zoff]
&& val0 > N9[localLin + 1 - get_local_size(0) - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + get_local_size(0) - zoff]
&& val0 > N9[localLin + get_local_size(0) - zoff]
&& val0 > N9[localLin + 1 + get_local_size(0) - zoff]
&& val0 > N9[localLin - 1 - get_local_size(0)]
&& val0 > N9[localLin - get_local_size(0)]
&& val0 > N9[localLin + 1 - get_local_size(0)]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + get_local_size(0)]
&& val0 > N9[localLin + get_local_size(0)]
&& val0 > N9[localLin + 1 + get_local_size(0)]
&& val0 > N9[localLin - 1 - get_local_size(0) + zoff]
&& val0 > N9[localLin - get_local_size(0) + zoff]
&& val0 > N9[localLin + 1 - get_local_size(0) + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + get_local_size(0) + zoff]
&& val0 > N9[localLin + get_local_size(0) + zoff]
&& val0 > N9[localLin + 1 + get_local_size(0) + zoff]
;
if(condmax)
{
int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
}
}
}
}
}
}
__kernel __kernel
void SURF_findMaximaInLayer( void SURF_findMaximaInLayer(
__global float * det, __global float * det,
int det_step, int det_offset,
__global float * trace, __global float * trace,
int trace_step, int trace_offset,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
volatile __global int* maxCounter, volatile __global int* maxCounter,
int counter_offset, int counter_offset,
int det_step, // the step of det in bytes int img_rows,
int trace_step, // the step of trace in bytes int img_cols,
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers, int c_nOctaveLayers,
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
@ -515,8 +337,8 @@ void SURF_findMaximaInLayer(
const int zoff = get_local_size(0) * get_local_size(1); const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
int l_x = min(max(j, 0), c_img_cols - 1); int l_x = min(max(j, 0), img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); int l_y = c_layer_rows * layer + min(max(i, 0), img_rows - 1);
N9[localLin - zoff] = N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x]; det[det_step * (l_y - c_layer_rows) + l_x];
@ -596,7 +418,7 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x)
if (det != 0) if (det != 0)
{ {
F invdet = 1.0 / det; F invdet = 1.0f / det;
x[0] = invdet * x[0] = invdet *
(b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) - (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) -
@ -632,13 +454,13 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x)
__kernel __kernel
void SURF_interpolateKeypoint( void SURF_interpolateKeypoint(
__global const float * det, __global const float * det,
int det_step, int det_offset,
__global const int4 * maxPosBuffer, __global const int4 * maxPosBuffer,
__global float * keypoints, __global float * keypoints,
volatile __global int * featureCounter, int keypoints_step, int keypoints_offset,
int det_step, volatile __global int* featureCounter,
int keypoints_step, int img_rows,
int c_img_rows, int img_cols,
int c_img_cols,
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
int c_max_features int c_max_features
@ -730,7 +552,7 @@ void SURF_interpolateKeypoint(
const int grad_wav_size = 2 * round(2.0f * s); const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big // check when grad_wav_size is too big
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) if ((img_rows + 1) >= grad_wav_size && (img_cols + 1) >= grad_wav_size)
{ {
// Get a new feature index. // Get a new feature index.
int ind = atomic_inc(featureCounter); int ind = atomic_inc(featureCounter);
@ -836,22 +658,18 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
__kernel __kernel
void SURF_calcOrientation( void SURF_calcOrientation(
IMAGE_INT32 sumTex, __PARAM_sumTex__, int img_rows, int img_cols,
__global float * keypoints, __global float * keypoints, int keypoints_step, int keypoints_offset )
int keypoints_step,
int c_img_rows,
int c_img_cols,
int sum_step
)
{ {
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
#ifndef HAVE_IMAGE2D
sum_step /= sizeof(uint); sum_step /= sizeof(uint);
#endif
__global float* featureX = keypoints + X_ROW * keypoints_step; __global float* featureX = keypoints + X_ROW * keypoints_step;
__global float* featureY = keypoints + Y_ROW * keypoints_step; __global float* featureY = keypoints + Y_ROW * keypoints_step;
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step; __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
__local float s_X[ORI_SAMPLES]; __local float s_X[ORI_SAMPLES];
__local float s_Y[ORI_SAMPLES]; __local float s_Y[ORI_SAMPLES];
__local float s_angle[ORI_SAMPLES]; __local float s_angle[ORI_SAMPLES];
@ -866,7 +684,6 @@ void SURF_calcOrientation(
and building the keypoint descriptor are defined relative to 's' */ and building the keypoint descriptor are defined relative to 's' */
const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f; const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are /* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s. sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the We ensure the gradient wavelet size is even to ensure the
@ -874,7 +691,7 @@ void SURF_calcOrientation(
const int grad_wav_size = 2 * round(2.0f * s); const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big // check when grad_wav_size is too big
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) if ((img_rows + 1) < grad_wav_size || (img_cols + 1) < grad_wav_size)
return; return;
// Calc X, Y, angle and store it to shared memory // Calc X, Y, angle and store it to shared memory
@ -886,8 +703,8 @@ void SURF_calcOrientation(
float ratio = (float)grad_wav_size / 4; float ratio = (float)grad_wav_size / 4;
int r2 = round(ratio * 2.0); int r2 = round(ratio * 2.0f);
int r4 = round(ratio * 4.0); int r4 = round(ratio * 4.0f);
for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
{ {
float X = 0.0f, Y = 0.0f, angle = 0.0f; float X = 0.0f, Y = 0.0f, angle = 0.0f;
@ -895,21 +712,20 @@ void SURF_calcOrientation(
const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && if (y >= 0 && y < (img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size) x >= 0 && x < (img_cols + 1) - grad_wav_size)
{ {
float apt = c_aptW[i]; float apt = c_aptW[i];
// Compute the haar sum without fetching duplicate pixels. // Compute the haar sum without fetching duplicate pixels.
float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); float t00 = read_sumTex( (int2)(x, y));
float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); float t02 = read_sumTex( (int2)(x, y + r2));
float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); float t04 = read_sumTex( (int2)(x, y + r4));
float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); float t20 = read_sumTex( (int2)(x + r2, y));
float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); float t24 = read_sumTex( (int2)(x + r2, y + r4));
float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); float t40 = read_sumTex( (int2)(x + r4, y));
float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); float t42 = read_sumTex( (int2)(x + r4, y + r2));
float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); float t44 = read_sumTex( (int2)(x + r4, y + r4));
F t = t00 - t04 - t20 + t24; F t = t00 - t04 - t20 + t24;
X -= t / ((r2) * (r4)); X -= t / ((r2) * (r4));
@ -1001,7 +817,7 @@ void SURF_calcOrientation(
} }
__kernel __kernel
void SURF_setUpright( void SURF_setUpRight(
__global float * keypoints, __global float * keypoints,
int keypoints_step, int keypoints_offset, int keypoints_step, int keypoints_offset,
int rows, int cols ) int rows, int cols )
@ -1050,22 +866,14 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
}; };
// utility for linear filter // utility for linear filter
inline uchar readerGet( #define readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, i, j) \
IMAGE_INT8 src, read_imgTex((float2)(centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir, \
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir))
int i, int j, int rows, int cols, int elemPerRow
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
}
inline float linearFilter( inline float linearFilter(
IMAGE_INT8 src, __PARAM_imgTex__, int img_rows, int img_cols,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, float centerX, float centerY, float win_offset,
float y, float x, int rows, int cols, int elemPerRow float cos_dir, float sin_dir, float y, float x )
)
{ {
x -= 0.5f; x -= 0.5f;
y -= 0.5f; y -= 0.5f;
@ -1077,34 +885,31 @@ inline float linearFilter(
const int x2 = x1 + 1; const int x2 = x1 + 1;
const int y2 = y1 + 1; const int y2 = y1 + 1;
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow); uchar src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y)); out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y)); out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1)); out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
out = out + src_reg * ((x - x1) * (y - y1)); out = out + src_reg * ((x - x1) * (y - y1));
return out; return out;
} }
void calc_dx_dy( void calc_dx_dy(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
int img_rows, int img_cols,
volatile __local float *s_dx_bin, volatile __local float *s_dx_bin,
volatile __local float *s_dy_bin, volatile __local float *s_dy_bin,
volatile __local float *s_PATCH, volatile __local float *s_PATCH,
__global const float* featureX, __global const float* featureX,
__global const float* featureY, __global const float* featureY,
__global const float* featureSize, __global const float* featureSize,
__global const float* featureDir, __global const float* featureDir )
int rows,
int cols,
int elemPerRow
)
{ {
const float centerX = featureX[get_group_id(0)]; const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)]; const float centerY = featureY[get_group_id(0)];
@ -1141,7 +946,9 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); s_PATCH[get_local_id(1) * 6 + get_local_id(0)] =
linearFilter(__PASS_imgTex__, img_rows, img_cols, centerX, centerY,
win_offset, cos_dir, sin_dir, icoo, jcoo);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1232,9 +1039,8 @@ void reduce_sum25(
__kernel __kernel
void SURF_computeDescriptors64( void SURF_computeDescriptors64(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
int img_step, int img_offset, int img_rows, int img_cols,
int rows, int cols,
__global const float* keypoints, __global const float* keypoints,
int keypoints_step, int keypoints_offset, int keypoints_step, int keypoints_offset,
__global float * descriptors, __global float * descriptors,
@ -1254,7 +1060,7 @@ void SURF_computeDescriptors64(
volatile __local float sdyabs[25]; volatile __local float sdyabs[25];
volatile __local float s_PATCH[6*6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1286,9 +1092,8 @@ void SURF_computeDescriptors64(
__kernel __kernel
void SURF_computeDescriptors128( void SURF_computeDescriptors128(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
int img_step, int img_offset, int img_rows, int img_cols,
int rows, int cols,
__global const float* keypoints, __global const float* keypoints,
int keypoints_step, int keypoints_offset, int keypoints_step, int keypoints_offset,
__global float* descriptors, __global float* descriptors,
@ -1313,7 +1118,7 @@ void SURF_computeDescriptors128(
volatile __local float sdabs2[25]; volatile __local float sdabs2[25];
volatile __local float s_PATCH[6*6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1486,7 +1291,7 @@ void reduce_sum64(volatile __local float* smem, int tid)
} }
__kernel __kernel
void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step) void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step, int descriptors_offset)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID
@ -1514,7 +1319,7 @@ void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_
} }
__kernel __kernel
void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step) void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step, int descriptors_offset)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID

@ -902,7 +902,7 @@ void SURF::operator()(InputArray _img, InputArray _mask,
bool doDescriptors = _descriptors.needed(); bool doDescriptors = _descriptors.needed();
CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4)); CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4));
CV_Assert(_descriptors.needed() && !useProvidedKeypoints); CV_Assert(_descriptors.needed() || !useProvidedKeypoints);
if( ocl::useOpenCL() ) if( ocl::useOpenCL() )
{ {

@ -54,14 +54,11 @@ protected:
bool setImage(InputArray img, InputArray mask); bool setImage(InputArray img, InputArray mask);
// kernel callers declarations // kernel callers declarations
bool calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int layer_rows); bool calcLayerDetAndTrace(int octave, int layer_rows);
bool findMaximaInLayer(const UMat &det, const UMat &trace, UMat &maxPosBuffer, bool findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols);
UMat &maxCounter, int counterOffset,
int octave, int layer_rows, int layer_cols);
bool interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, bool interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int maxFeatures);
UMat &keypoints, UMat &counters, int octave, int layer_rows, int maxFeatures);
bool calcOrientation(UMat &keypoints); bool calcOrientation(UMat &keypoints);
@ -75,7 +72,7 @@ protected:
int refcount; int refcount;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535) //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
UMat sum, mask1, maskSum, intBuffer; UMat sum, intBuffer;
UMat det, trace; UMat det, trace;
UMat maxPosBuffer; UMat maxPosBuffer;
@ -87,12 +84,11 @@ protected:
UMat img, counters; UMat img, counters;
// texture buffers // texture buffers
ocl::Image2D imgTex, sumTex, maskSumTex; ocl::Image2D imgTex, sumTex;
bool haveImageSupport; bool haveImageSupport;
String kerOpts;
int status; int status;
ocl::Kernel kerCalcDetTrace, kerFindMaxima, kerFindMaximaMask, kerInterp;
ocl::Kernel kerUpRight, kerOri, kerCalcDesc64, kerCalcDesc128, kerNormDesc64, kerNormDesc128;
}; };
/* /*

@ -54,20 +54,6 @@ namespace cv
enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) }; enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) };
/*static void openCLExecuteKernelSURF(Context2 *clCxt, const ProgramEntry* source, String kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
{
std::stringstream optsStr;
optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
size_t wave_size = queryWaveFrontSize(kernel);
CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
optsStr << "-D WAVE_SIZE=" << wave_size;
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
}*/
static inline int calcSize(int octave, int layer) static inline int calcSize(int octave, int layer)
{ {
/* Wavelet size at first layer of first octave. */ /* Wavelet size at first layer of first octave. */
@ -100,22 +86,11 @@ bool SURF_OCL::init(const SURF* p)
if(ocl::haveOpenCL()) if(ocl::haveOpenCL())
{ {
const ocl::Device& dev = ocl::Device::getDefault(); const ocl::Device& dev = ocl::Device::getDefault();
if( dev.type() == ocl::Device::TYPE_CPU ) if( dev.type() == ocl::Device::TYPE_CPU || dev.doubleFPConfig() == 0 )
return false; return false;
haveImageSupport = dev.imageSupport(); haveImageSupport = false;//dev.imageSupport();
String opts = haveImageSupport ? "-D DISABLE_IMAGE2D" : ""; kerOpts = haveImageSupport ? "-D HAVE_IMAGE2D -D DOUBLE_SUPPORT" : "";
status = 1;
if( kerCalcDetTrace.create("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, opts) &&
kerFindMaxima.create("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, opts) &&
kerFindMaximaMask.create("SURF_findMaximaInLayerWithMask", ocl::nonfree::surf_oclsrc, opts) &&
kerInterp.create("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, opts) &&
kerUpRight.create("SURF_setUpRight", ocl::nonfree::surf_oclsrc, opts) &&
kerOri.create("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, opts) &&
kerCalcDesc64.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, opts) &&
kerCalcDesc128.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, opts) &&
kerNormDesc64.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, opts) &&
kerNormDesc128.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, opts))
status = 1;
} }
} }
return status > 0; return status > 0;
@ -126,8 +101,10 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask)
{ {
if( status <= 0 ) if( status <= 0 )
return false; return false;
CV_Assert(!_img.empty() && _img.type() == CV_8UC1); if( !_mask.empty())
CV_Assert(_mask.empty() || (_mask.size() == _img.size() && _mask.type() == CV_8UC1)); return false;
int imgtype = _img.type();
CV_Assert(!_img.empty());
CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0); CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0);
int min_size = calcSize(params->nOctaves - 1, 0); int min_size = calcSize(params->nOctaves - 1, 0);
@ -151,10 +128,12 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask)
counters.setTo(Scalar::all(0)); counters.setTo(Scalar::all(0));
img.release(); img.release();
if(_img.isUMat()) if(_img.isUMat() && imgtype == CV_8UC1)
img = _img.getUMat(); img = _img.getUMat();
else else if( imgtype == CV_8UC1 )
_img.copyTo(img); _img.copyTo(img);
else
cvtColor(_img, img, COLOR_BGR2GRAY);
integral(img, sum); integral(img, sum);
@ -164,12 +143,6 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask)
sumTex = ocl::Image2D(sum); sumTex = ocl::Image2D(sum);
} }
maskSumTex = ocl::Image2D();
if(!_mask.empty())
{
CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet");
}
return true; return true;
} }
@ -191,11 +164,10 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints)
const int layer_rows = img_rows >> octave; const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave; const int layer_cols = img_cols >> octave;
if(!calcLayerDetAndTrace(det, trace, octave, layer_rows)) if(!calcLayerDetAndTrace(octave, layer_rows))
return false; return false;
if(!findMaximaInLayer(det, trace, maxPosBuffer, counters, 1 + octave, octave, if(!findMaximaInLayer(1 + octave, octave, layer_rows, layer_cols))
layer_rows, layer_cols))
return false; return false;
cpuCounters = counters.getMat(ACCESS_READ); cpuCounters = counters.getMat(ACCESS_READ);
@ -205,8 +177,7 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints)
if (maxCounter > 0) if (maxCounter > 0)
{ {
if(!interpolateKeypoint(det, maxPosBuffer, maxCounter, keypoints, if(!interpolateKeypoint(maxCounter, keypoints, octave, layer_rows, maxFeatures))
counters, octave, layer_rows, maxFeatures))
return false; return false;
} }
} }
@ -216,7 +187,7 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints)
featureCounter = std::min(featureCounter, maxFeatures); featureCounter = std::min(featureCounter, maxFeatures);
cpuCounters.release(); cpuCounters.release();
keypoints = UMat(keypoints, Rect(0, 0, featureCounter, 1)); keypoints = UMat(keypoints, Rect(0, 0, featureCounter, keypoints.rows));
if (params->upright) if (params->upright)
return setUpRight(keypoints); return setUpRight(keypoints);
@ -232,7 +203,8 @@ bool SURF_OCL::setUpRight(UMat &keypoints)
return true; return true;
size_t globalThreads[3] = {nFeatures, 1}; size_t globalThreads[3] = {nFeatures, 1};
return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, false); ocl::Kernel kerUpRight("SURF_setUpRight", ocl::nonfree::surf_oclsrc, kerOpts);
return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, true);
} }
bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors) bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors)
@ -255,14 +227,14 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor
if( descriptorSize == 64 ) if( descriptorSize == 64 )
{ {
kerCalcDesc = kerCalcDesc64; kerCalcDesc.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
kerNormDesc = kerNormDesc64; kerNormDesc.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
} }
else else
{ {
CV_Assert(descriptorSize == 128); CV_Assert(descriptorSize == 128);
kerCalcDesc = kerCalcDesc128; kerCalcDesc.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
kerNormDesc = kerNormDesc128; kerNormDesc.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
} }
size_t localThreads[] = {6, 6}; size_t localThreads[] = {6, 6};
@ -271,17 +243,19 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor
if(haveImageSupport) if(haveImageSupport)
{ {
kerCalcDesc.args(imgTex, kerCalcDesc.args(imgTex,
img_rows, img_cols,
ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::ReadOnlyNoSize(keypoints),
ocl::KernelArg::WriteOnlyNoSize(descriptors)); ocl::KernelArg::WriteOnlyNoSize(descriptors));
} }
else else
{ {
kerCalcDesc.args(ocl::KernelArg::ReadOnly(img), kerCalcDesc.args(ocl::KernelArg::ReadOnlyNoSize(img),
img_rows, img_cols,
ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::ReadOnlyNoSize(keypoints),
ocl::KernelArg::WriteOnlyNoSize(descriptors)); ocl::KernelArg::WriteOnlyNoSize(descriptors));
} }
if(!kerCalcDesc.run(2, globalThreads, localThreads, false)) if(!kerCalcDesc.run(2, globalThreads, localThreads, true))
return false; return false;
size_t localThreads_n[] = {descriptorSize, 1}; size_t localThreads_n[] = {descriptorSize, 1};
@ -290,7 +264,7 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor
globalThreads[0] = nFeatures * localThreads[0]; globalThreads[0] = nFeatures * localThreads[0];
globalThreads[1] = localThreads[1]; globalThreads[1] = localThreads[1];
bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)). bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)).
run(2, globalThreads_n, localThreads_n, false); run(2, globalThreads_n, localThreads_n, true);
if(ok && !_descriptors.isUMat()) if(ok && !_descriptors.isUMat())
descriptors.copyTo(_descriptors); descriptors.copyTo(_descriptors);
return ok; return ok;
@ -364,19 +338,19 @@ void SURF_OCL::downloadKeypoints(const UMat &keypointsGPU, std::vector<KeyPoint>
} }
} }
bool SURF_OCL::detect(InputArray img, InputArray mask, UMat& keypoints) bool SURF_OCL::detect(InputArray _img, InputArray _mask, UMat& keypoints)
{ {
if( !setImage(img, mask) ) if( !setImage(_img, _mask) )
return false; return false;
return detectKeypoints(keypoints); return detectKeypoints(keypoints);
} }
bool SURF_OCL::detectAndCompute(InputArray img, InputArray mask, UMat& keypoints, bool SURF_OCL::detectAndCompute(InputArray _img, InputArray _mask, UMat& keypoints,
OutputArray _descriptors, bool useProvidedKeypoints ) OutputArray _descriptors, bool useProvidedKeypoints )
{ {
if( !setImage(img, mask) ) if( !setImage(_img, _mask) )
return false; return false;
if( !useProvidedKeypoints && !detectKeypoints(keypoints) ) if( !useProvidedKeypoints && !detectKeypoints(keypoints) )
@ -389,22 +363,20 @@ inline int divUp(int a, int b) { return (a + b-1)/b; }
//////////////////////////// ////////////////////////////
// kernel caller definitions // kernel caller definitions
bool SURF_OCL::calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int c_layer_rows) bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows)
{ {
int nOctaveLayers = params->nOctaveLayers; int nOctaveLayers = params->nOctaveLayers;
const int min_size = calcSize(octave, 0); const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave); const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
String kernelName = "SURF_calcLayerDetAndTrace"; size_t localThreads[] = {16, 16};
std::vector< std::pair<size_t, const void *> > args; size_t globalThreads[] =
size_t localThreads[3] = {16, 16};
size_t globalThreads[3] =
{ {
divUp(max_samples_j, localThreads[0]) *localThreads[0], divUp(max_samples_j, localThreads[0]) *localThreads[0],
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2) divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2)
}; };
ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, kerOpts);
if(haveImageSupport) if(haveImageSupport)
{ {
kerCalcDetTrace.args(sumTex, kerCalcDetTrace.args(sumTex,
@ -421,56 +393,15 @@ bool SURF_OCL::calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int c_la
ocl::KernelArg::WriteOnlyNoSize(det), ocl::KernelArg::WriteOnlyNoSize(det),
ocl::KernelArg::WriteOnlyNoSize(trace)); ocl::KernelArg::WriteOnlyNoSize(trace));
} }
return kerCalcDetTrace.run(2, globalThreads, localThreads, false); return kerCalcDetTrace.run(2, globalThreads, localThreads, true);
} }
bool SURF_OCL::findMaximaInLayer(const UMat &det, const UMat &trace, bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave,
UMat &maxPosBuffer, UMat &maxCounter,
int counterOffset, int octave,
int layer_rows, int layer_cols) int layer_rows, int layer_cols)
{ {
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
bool haveMask = !maskSum.empty() || (maskSumTex.ptr() != 0);
int nOctaveLayers = params->nOctaveLayers; int nOctaveLayers = params->nOctaveLayers;
ocl::Kernel ker;
if( haveMask )
{
if( haveImageSupport )
ker = kerFindMaximaMask.args(maskSumTex,
ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::ReadOnlyNoSize(trace),
ocl::KernelArg::PtrReadWrite(maxPosBuffer),
ocl::KernelArg::PtrReadWrite(maxCounter),
counterOffset, img_rows, img_cols,
octave, nOctaveLayers,
layer_rows, layer_cols,
maxCandidates,
(float)params->hessianThreshold);
else
ker = kerFindMaximaMask.args(ocl::KernelArg::ReadOnlyNoSize(maskSum),
ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::ReadOnlyNoSize(trace),
ocl::KernelArg::PtrReadWrite(maxPosBuffer),
ocl::KernelArg::PtrReadWrite(maxCounter),
counterOffset, img_rows, img_cols,
octave, nOctaveLayers,
layer_rows, layer_cols,
maxCandidates,
(float)params->hessianThreshold);
}
else
{
ker = kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::ReadOnlyNoSize(trace),
ocl::KernelArg::PtrReadWrite(maxPosBuffer),
ocl::KernelArg::PtrReadWrite(maxCounter),
counterOffset, img_rows, img_cols,
octave, nOctaveLayers,
layer_rows, layer_cols,
maxCandidates,
(float)params->hessianThreshold);
}
size_t localThreads[3] = {16, 16}; size_t localThreads[3] = {16, 16};
size_t globalThreads[3] = size_t globalThreads[3] =
{ {
@ -478,21 +409,31 @@ bool SURF_OCL::findMaximaInLayer(const UMat &det, const UMat &trace,
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nOctaveLayers *localThreads[1] divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nOctaveLayers *localThreads[1]
}; };
return ker.run(2, globalThreads, localThreads, false); ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, kerOpts);
return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::ReadOnlyNoSize(trace),
ocl::KernelArg::PtrReadWrite(maxPosBuffer),
ocl::KernelArg::PtrReadWrite(counters),
counterOffset, img_rows, img_cols,
octave, nOctaveLayers,
layer_rows, layer_cols,
maxCandidates,
(float)params->hessianThreshold).run(2, globalThreads, localThreads, true);
} }
bool SURF_OCL::interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, bool SURF_OCL::interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int max_features)
UMat &keypoints, UMat &counters_, int octave, int layer_rows, int max_features)
{ {
size_t localThreads[3] = {3, 3, 3}; size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3}; size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3};
ocl::Kernel kerInterp("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, kerOpts);
return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det), return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::PtrReadOnly(maxPosBuffer), ocl::KernelArg::PtrReadOnly(maxPosBuffer),
ocl::KernelArg::ReadWriteNoSize(keypoints), ocl::KernelArg::ReadWriteNoSize(keypoints),
ocl::KernelArg::PtrReadWrite(counters_), ocl::KernelArg::PtrReadWrite(counters),
img_rows, img_cols, octave, layer_rows, max_features). img_rows, img_cols, octave, layer_rows, max_features).
run(3, globalThreads, localThreads, false); run(3, globalThreads, localThreads, true);
} }
bool SURF_OCL::calcOrientation(UMat &keypoints) bool SURF_OCL::calcOrientation(UMat &keypoints)
@ -500,18 +441,19 @@ bool SURF_OCL::calcOrientation(UMat &keypoints)
int nFeatures = keypoints.cols; int nFeatures = keypoints.cols;
if( nFeatures == 0 ) if( nFeatures == 0 )
return true; return true;
ocl::Kernel kerOri("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, kerOpts);
if( haveImageSupport ) if( haveImageSupport )
kerOri.args(sumTex, kerOri.args(sumTex, img_rows, img_cols,
ocl::KernelArg::ReadWriteNoSize(keypoints), ocl::KernelArg::ReadWriteNoSize(keypoints));
img_rows, img_cols);
else else
kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum), kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum),
ocl::KernelArg::ReadWriteNoSize(keypoints), img_rows, img_cols,
img_rows, img_cols); ocl::KernelArg::ReadWriteNoSize(keypoints));
size_t localThreads[3] = {ORI_LOCAL_SIZE, 1}; size_t localThreads[3] = {ORI_LOCAL_SIZE, 1};
size_t globalThreads[3] = {nFeatures * localThreads[0], 1}; size_t globalThreads[3] = {nFeatures * localThreads[0], 1};
return kerOri.run(2, globalThreads, localThreads, false); return kerOri.run(2, globalThreads, localThreads, true);
} }
} }

Loading…
Cancel
Save