diff --git a/modules/ocl/src/kernels/nonfree_surf.cl b/modules/ocl/src/kernels/nonfree_surf.cl index e51b2d09ad..5916b2557c 100644 --- a/modules/ocl/src/kernels/nonfree_surf.cl +++ b/modules/ocl/src/kernels/nonfree_surf.cl @@ -43,10 +43,39 @@ // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +// specialized for non-image2d_t supported platform, intel HD4000, for example +#ifdef DISABLE_IMAGE2D +#define IMAGE_INT32 __global uint * +#define IMAGE_INT8 __global uchar * +#else +#define IMAGE_INT32 image2d_t +#define IMAGE_INT8 image2d_t +#endif + +uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow) +{ +#ifdef DISABLE_IMAGE2D + int x = clamp(coord.x, 0, cols); + int y = clamp(coord.y, 0, rows); + return img[elemPerRow * y + x]; +#else + return read_imageui(img, sam, coord).x; +#endif +} +uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) +{ +#ifdef DISABLE_IMAGE2D + int x = clamp(convert_int_rte(coord.x), 0, cols - 1); + int y = clamp(convert_int_rte(coord.y), 0, rows - 1); + return img[elemPerRow * y + x]; +#else + return (uchar)read_imageui(img, sam, coord).x; +#endif +} + // dynamically change the precision used for floating type #if defined (__ATI__) || defined (__NVIDIA__) @@ -58,14 +87,24 @@ // Image read mode __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; +#ifndef FLT_EPSILON #define FLT_EPSILON (1e-15) -#define CV_PI_F 3.14159265f +#endif +#ifndef CV_PI_F +#define CV_PI_F 3.14159265f +#endif // Use integral image to calculate haar wavelets. // N = 2 // for simple haar paatern -float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x) +float icvCalcHaarPatternSum_2( + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, + int rows, int cols, int elemPerRow) { float ratio = (float)newSize / oldSize; @@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int int dy2 = convert_int_rte(ratio * src[k][3]); F t = 0; - t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; - t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; - + t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); } @@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int } // N = 3 -float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x) +float icvCalcHaarPatternSum_3( + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, + int rows, int cols, int elemPerRow) { float ratio = (float)newSize / oldSize; @@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int int dy2 = convert_int_rte(ratio * src[k][3]); F t = 0; - t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; - t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; - + t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); } @@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int } // N = 4 -float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x) +float icvCalcHaarPatternSum_4( + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, + int rows, int cols, int elemPerRow) { float ratio = (float)newSize / oldSize; @@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int int dy2 = convert_int_rte(ratio * src[k][3]); F t = 0; - t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; - t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; - t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; - + t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); } @@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer) //calculate targeted layer per-pixel determinant and trace with an integral image __kernel void icvCalcLayerDetAndTrace( - image2d_t sumTex, // input integral image + IMAGE_INT32 sumTex, // input integral image __global float * det, // output Determinant __global float * trace, // output trace int det_step, // the step of det in bytes @@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace( int c_img_cols, int c_nOctaveLayers, int c_octave, - int c_layer_rows + int c_layer_rows, + int sumTex_step ) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); + sumTex_step/= sizeof(uint); // Determine the indices const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); const int blockIdx_y = get_group_id(1) % gridDim_y; @@ -205,12 +255,12 @@ __kernel void icvCalcLayerDetAndTrace( if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave); - const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave); - const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave); + const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); + const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); + const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; - trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; + trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; } } @@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace( __constant float c_DM[5] = {0, 0, 9, 9, 1}; -bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) +bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step) { float ratio = (float)size / 9.0f; @@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) float t = 0; - t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x; - t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x; - t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x; - t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x; + t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step); + t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step); + t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step); + t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); @@ -246,9 +296,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) // Non-maximal suppression to further filtering the candidates from previous step __kernel void icvFindMaximaInLayer_withmask( - __global const float * det, - __global const float * trace, - __global int4 * maxPosBuffer, + __global const float * det, + __global const float * trace, + __global int4 * maxPosBuffer, volatile __global unsigned int* maxCounter, int counter_offset, int det_step, // the step of det in bytes @@ -261,7 +311,8 @@ __kernel int c_layer_cols, int c_max_candidates, float c_hessianThreshold, - image2d_t maskSumTex + IMAGE_INT32 maskSumTex, + int mask_step ) { volatile __local float N9[768]; // threads.x * threads.y * 3 @@ -269,6 +320,7 @@ __kernel det_step /= sizeof(*det); trace_step /= sizeof(*trace); maxCounter += counter_offset; + mask_step /= sizeof(uint); // Determine the indices const int gridDim_y = get_num_groups(1) / c_nOctaveLayers; @@ -288,26 +340,26 @@ __kernel // Is this thread within the hessian buffer? const int zoff = get_local_size(0) * get_local_size(1); const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; - N9[localLin - zoff] = - det[det_step * + N9[localLin - zoff] = + det[det_step * (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin ] = - det[det_step * + N9[localLin ] = + det[det_step * (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin + zoff] = - det[det_step * + N9[localLin + zoff] = + det[det_step * (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x barrier(CLK_LOCAL_MEM_FENCE); - if (i < c_layer_rows - margin + if (i < c_layer_rows - margin && j < c_layer_cols - margin - && get_local_id(0) > 0 + && get_local_id(0) > 0 && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 + && get_local_id(1) > 0 && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA ) { @@ -321,7 +373,7 @@ __kernel const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (within_check(maskSumTex, sum_i, sum_j, size)) + if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step)) { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] @@ -372,9 +424,9 @@ __kernel __kernel void icvFindMaximaInLayer( - __global float * det, - __global float * trace, - __global int4 * maxPosBuffer, + __global float * det, + __global float * trace, + __global int4 * maxPosBuffer, volatile __global unsigned int* maxCounter, int counter_offset, int det_step, // the step of det in bytes @@ -417,19 +469,19 @@ __kernel int l_x = min(max(j, 0), c_img_cols - 1); int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); - N9[localLin - zoff] = + N9[localLin - zoff] = det[det_step * (l_y - c_layer_rows) + l_x]; - N9[localLin ] = + N9[localLin ] = det[det_step * (l_y ) + l_x]; - N9[localLin + zoff] = + N9[localLin + zoff] = det[det_step * (l_y + c_layer_rows) + l_x]; barrier(CLK_LOCAL_MEM_FENCE); - if (i < c_layer_rows - margin + if (i < c_layer_rows - margin && j < c_layer_cols - margin - && get_local_id(0) > 0 + && get_local_id(0) > 0 && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 + && get_local_id(1) > 0 && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA ) { @@ -497,17 +549,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc { F invdet = 1.0 / det; - x[0] = invdet * + x[0] = invdet * (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); - x[1] = invdet * + x[1] = invdet * (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); - x[2] = invdet * + x[2] = invdet * (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); @@ -528,9 +580,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc //////////////////////////////////////////////////////////////////////// // INTERPOLATION -__kernel +__kernel void icvInterpolateKeypoint( - __global const float * det, + __global const float * det, __global const int4 * maxPosBuffer, __global float * keypoints, volatile __global unsigned int * featureCounter, @@ -560,7 +612,7 @@ __kernel volatile __local float N9[3][3][3]; - N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = + N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = det[det_step * (c_layer_rows * layer + i) + j]; barrier(CLK_LOCAL_MEM_FENCE); @@ -658,27 +710,27 @@ __kernel __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; -__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, - 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, - 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, - 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, - 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, - 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, - 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, - 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, - 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, - 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, - 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, - 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, - 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, - 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, +__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, + 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, + 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, + 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, + 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, + 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, + 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, + 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, + 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, + 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, + 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, + 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, + 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, + 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, - 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, - 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, - 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, + 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, + 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, + 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, - 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, - 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, + 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, + 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f}; @@ -691,27 +743,29 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int data[tid] = partial_reduction; barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) + if (tid < 16) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); } #undef op } __kernel void icvCalcOrientation( - image2d_t sumTex, + IMAGE_INT32 sumTex, __global float * keypoints, int keypoints_step, int c_img_rows, - int c_img_cols + int c_img_cols, + int sum_step ) { keypoints_step /= sizeof(*keypoints); + sum_step /= sizeof(uint); __global float* featureX = keypoints + X_ROW * keypoints_step; __global float* featureY = keypoints + Y_ROW * keypoints_step; __global float* featureSize = keypoints + SIZE_ROW * keypoints_step; @@ -754,8 +808,8 @@ __kernel if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && x >= 0 && x < (c_img_cols + 1) - grad_wav_size) { - X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x); - Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x); + X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); + Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); angle = atan2(Y, X); @@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] = // utility for linear filter inline uchar readerGet( - image2d_t src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, - int i, int j + IMAGE_INT8 src, + const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, + int i, int j, int rows, int cols, int elemPerRow ) { float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; - return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x; + return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow); } inline float linearFilter( - image2d_t src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, - float y, float x + IMAGE_INT8 src, + const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, + float y, float x, int rows, int cols, int elemPerRow ) { x -= 0.5f; @@ -907,30 +961,33 @@ inline float linearFilter( const int x2 = x1 + 1; const int y2 = y1 + 1; - uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1); + uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow); out = out + src_reg * ((x2 - x) * (y2 - y)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2); + src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow); out = out + src_reg * ((x - x1) * (y2 - y)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1); + src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow); out = out + src_reg * ((x2 - x) * (y - y1)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2); + src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow); out = out + src_reg * ((x - x1) * (y - y1)); return out; } void calc_dx_dy( - image2d_t imgTex, + IMAGE_INT8 imgTex, volatile __local float s_dx_bin[25], volatile __local float s_dy_bin[25], volatile __local float s_PATCH[6][6], - __global const float* featureX, - __global const float* featureY, - __global const float* featureSize, - __global const float* featureDir + __global const float* featureX, + __global const float* featureY, + __global const float* featureSize, + __global const float* featureDir, + int rows, + int cols, + int elemPerRow ) { const float centerX = featureX[get_group_id(0)]; @@ -965,7 +1022,7 @@ void calc_dx_dy( const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; - s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo); + s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); barrier(CLK_LOCAL_MEM_FENCE); @@ -976,26 +1033,26 @@ void calc_dx_dy( const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; const float vx = ( - s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) + s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) ][get_local_id(0) ] + + s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) * dw; const float vy = ( - s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) + s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - + s_PATCH[get_local_id(1) ][get_local_id(0) ] + + s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) * dw; s_dx_bin[tid] = vx; s_dy_bin[tid] = vy; } } void reduce_sum25( - volatile __local float* sdata1, - volatile __local float* sdata2, - volatile __local float* sdata3, - volatile __local float* sdata4, + volatile __local float* sdata1, + volatile __local float* sdata2, + volatile __local float* sdata3, + volatile __local float* sdata4, int tid ) { @@ -1033,18 +1090,20 @@ void reduce_sum25( } } -__kernel +__kernel void compute_descriptors64( - image2d_t imgTex, - volatile __global float * descriptors, + IMAGE_INT8 imgTex, + volatile __global float * descriptors, __global const float * keypoints, int descriptors_step, - int keypoints_step + int keypoints_step, + int rows, + int cols, + int img_step ) { descriptors_step /= sizeof(float); keypoints_step /= sizeof(float); - __global const float * featureX = keypoints + X_ROW * keypoints_step; __global const float * featureY = keypoints + Y_ROW * keypoints_step; __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step; @@ -1057,7 +1116,7 @@ __kernel volatile __local float sdyabs[25]; volatile __local float s_PATCH[6][6]; - calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); + calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); barrier(CLK_LOCAL_MEM_FENCE); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); @@ -1066,10 +1125,10 @@ __kernel { sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdyabs[tid] = fabs(sdy[tid]); // |dy| array - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); @@ -1083,13 +1142,16 @@ __kernel } } } -__kernel +__kernel void compute_descriptors128( - image2d_t imgTex, - __global volatile float * descriptors, + IMAGE_INT8 imgTex, + __global volatile float * descriptors, __global float * keypoints, int descriptors_step, - int keypoints_step + int keypoints_step, + int rows, + int cols, + int img_step ) { descriptors_step /= sizeof(*descriptors); @@ -1111,7 +1173,7 @@ __kernel volatile __local float sdabs2[25]; volatile __local float s_PATCH[6][6]; - calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); + calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); barrier(CLK_LOCAL_MEM_FENCE); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); @@ -1132,10 +1194,10 @@ __kernel sd2[tid] = sdx[tid]; sdabs2[tid] = fabs(sdx[tid]); } - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); @@ -1162,10 +1224,10 @@ __kernel sd2[tid] = sdy[tid]; sdabs2[tid] = fabs(sdy[tid]); } - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) if (tid == 0) @@ -1178,7 +1240,7 @@ __kernel } } -__kernel +__kernel void normalize_descriptors128(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); @@ -1219,7 +1281,7 @@ __kernel // normalize and store in output descriptor_base[get_local_id(0)] = lookup / len; } -__kernel +__kernel void normalize_descriptors64(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp index dfbf7b1bbb..2c132396da 100644 --- a/modules/ocl/src/mcwutil.cpp +++ b/modules/ocl/src/mcwutil.cpp @@ -221,6 +221,36 @@ namespace cv { openCLFree(texture); } + + bool support_image2d(Context *clCxt) + { + static const char * _kernel_string = "__kernel void test_func(image2d_t img) {}"; + static bool _isTested = false; + static bool _support = false; + if(_isTested) + { + return _support; + } + try + { + cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func"); + _support = true; + } + catch (const cv::Exception& e) + { + if(e.code == -217) + { + _support = false; + } + else + { + // throw e once again + throw e; + } + } + _isTested = true; + return _support; + } }//namespace ocl }//namespace cv diff --git a/modules/ocl/src/mcwutil.hpp b/modules/ocl/src/mcwutil.hpp index d1986b93bd..7f2745111c 100644 --- a/modules/ocl/src/mcwutil.hpp +++ b/modules/ocl/src/mcwutil.hpp @@ -70,6 +70,10 @@ namespace cv // 2. for faster clamping, there is no buffer padding for the constructed texture cl_mem bindTexture(const oclMat &mat); void releaseTexture(cl_mem& texture); + + // returns whether the current context supports image2d_t format or not + bool support_image2d(Context *clCxt = Context::getContext()); + }//namespace ocl }//namespace cv diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index d4dbfd5061..9214406fd5 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -574,8 +574,9 @@ static void lkSparse_run(oclMat &I, oclMat &J, Context *clCxt = I.clCxt; int elemCntPerRow = I.step / I.elemSize(); string kernelName = "lkSparse"; - size_t localThreads[3] = { 8, 8, 1 }; - size_t globalThreads[3] = { 8 * ptcount, 8, 1}; + bool isImageSupported = support_image2d(); + size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 }; + size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1}; int cn = I.oclchannels(); char calcErr; if (level == 0) @@ -588,8 +589,9 @@ static void lkSparse_run(oclMat &I, oclMat &J, } vector > args; - cl_mem ITex = bindTexture(I); - cl_mem JTex = bindTexture(J); + + cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data; + cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data; args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex )); @@ -602,6 +604,8 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&level )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); + if (!isImageSupported) + args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y )); args.push_back( make_pair( sizeof(cl_int), (void *)&cn )); @@ -610,19 +614,14 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - try + if(isImageSupported) { openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); - } - catch(Exception&) - { - printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n"); releaseTexture(ITex); releaseTexture(JTex); - ITex = (cl_mem)I.data; - JTex = (cl_mem)J.data; - localThreads[1] = globalThreads[1] = 32; - args.insert( args.begin()+11, make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); + } + else + { openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); } } @@ -724,7 +723,7 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) { Context *clCxt = I.clCxt; - bool isImageSupported = clCxt->impl->devName.find("Intel(R) HD Graphics") == string::npos; + bool isImageSupported = support_image2d(); int elemCntPerRow = I.step / I.elemSize(); string kernelName = "lkDense"; diff --git a/modules/ocl/src/surf.cpp b/modules/ocl/src/surf.cpp index 65dc86d167..2e06f4439f 100644 --- a/modules/ocl/src/surf.cpp +++ b/modules/ocl/src/surf.cpp @@ -1,4 +1,4 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// +/*M///////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // @@ -44,6 +44,7 @@ //M*/ #include #include "precomp.hpp" +#include "mcwutil.hpp" //#include "opencv2/highgui/highgui.hpp" using namespace cv; @@ -71,7 +72,7 @@ static inline int calcSize(int octave, int layer) /* Wavelet size increment between layers. This should be an even number, such that the wavelet sizes in an octave are either all even or all odd. - This ensures that when looking for the neighbours of a sample, the layers + This ensures that when looking for the neighbors of a sample, the layers above and below are aligned correctly. */ const int HAAR_SIZE_INC = 6; @@ -79,6 +80,11 @@ static inline int calcSize(int octave, int layer) return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } +namespace +{ + const char* noImage2dOption = "-D DISABLE_IMAGE2D"; +} + class SURF_OCL_Invoker { public: @@ -88,7 +94,7 @@ public: //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold); //void loadOctaveConstants(int octave, int layer_rows, int layer_cols); - // kernel callers declearations + // kernel callers declarations void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows); void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, @@ -100,14 +106,14 @@ public: void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures); void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures); - // end of kernel callers declearations + // end of kernel callers declarations SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) : surf_(surf), img_cols(img.cols), img_rows(img.rows), - use_mask(!mask.empty()), - imgTex(NULL), sumTex(NULL), maskSumTex(NULL) + use_mask(!mask.empty()), counters(oclMat()), + imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img) { CV_Assert(!img.empty() && img.type() == CV_8UC1); CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); @@ -131,12 +137,13 @@ public: counters.create(1, surf_.nOctaves + 1, CV_32SC1); counters.setTo(Scalar::all(0)); - //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast(surf_.hessianThreshold)); - - bindImgTex(img, imgTex); - integral(img, surf_.sum); // the two argumented integral version is incorrect + integral(img, surf_.sum); + if(support_image2d()) + { + bindImgTex(img, imgTex); + bindImgTex(surf_.sum, sumTex); + } - bindImgTex(surf_.sum, sumTex); maskSumTex = 0; if (use_mask) @@ -155,7 +162,7 @@ public: void detectKeypoints(oclMat &keypoints) { // create image pyramid buffers - // different layers have same sized buffers, but they are sampled from gaussin kernel. + // different layers have same sized buffers, but they are sampled from Gaussian kernel. ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det); ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace); @@ -222,7 +229,6 @@ public: openCLFree(sumTex); if(maskSumTex) openCLFree(maskSumTex); - additioalParamBuffer.release(); } private: @@ -236,13 +242,13 @@ private: int maxFeatures; oclMat counters; - + // texture buffers cl_mem imgTex; cl_mem sumTex; cl_mem maskSumTex; - oclMat additioalParamBuffer; + const oclMat _img; // make a copy for non-image2d_t supported platform SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right) { @@ -362,11 +368,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat { if (!img.empty()) { - if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) - { - cout << " Intel HD GPU device unsupported " << endl; - return; - } SURF_OCL_Invoker surf(*this, img, mask); surf.detectKeypoints(keypoints); @@ -378,11 +379,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat { if (!img.empty()) { - if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) - { - cout << " Intel HD GPU device unsupported " << endl; - return; - } SURF_OCL_Invoker surf(*this, img, mask); if (!useProvidedKeypoints) @@ -443,74 +439,11 @@ void cv::ocl::SURF_OCL::releaseMemory() // bind source buffer to image oject. void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture) { - cl_image_format format; - int err; - int depth = img.depth(); - int channels = img.channels(); - - switch(depth) - { - case CV_8U: - format.image_channel_data_type = CL_UNSIGNED_INT8; - break; - case CV_32S: - format.image_channel_data_type = CL_UNSIGNED_INT32; - break; - case CV_32F: - format.image_channel_data_type = CL_FLOAT; - break; - default: - throw std::exception(); - break; - } - switch(channels) - { - case 1: - format.image_channel_order = CL_R; - break; - case 3: - format.image_channel_order = CL_RGB; - break; - case 4: - format.image_channel_order = CL_RGBA; - break; - default: - throw std::exception(); - break; - } if(texture) { openCLFree(texture); } - -#ifdef CL_VERSION_1_2 - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = img.step / img.elemSize(); - desc.image_height = img.rows; - desc.image_depth = 0; - desc.image_array_size = 1; - desc.image_row_pitch = 0; - desc.image_slice_pitch = 0; - desc.buffer = NULL; - desc.num_mip_levels = 0; - desc.num_samples = 0; - texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); -#else - texture = clCreateImage2D( - Context::getContext()->impl->clContext, - CL_MEM_READ_WRITE, - &format, - img.step / img.elemSize(), - img.rows, - 0, - NULL, - &err); -#endif - size_t origin[] = { 0, 0, 0 }; - size_t region[] = { img.step / img.elemSize(), img.rows, 1 }; - clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0); - openCLSafeCall(err); + texture = bindTexture(img); } //////////////////////////// @@ -525,7 +458,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i string kernelName = "icvCalcLayerDetAndTrace"; vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); + if(sumTex) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported + } args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&det.step)); @@ -535,6 +475,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers)); args.push_back( make_pair( sizeof(cl_int), (void *)&octave)); args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = @@ -543,7 +484,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2), 1 }; - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, @@ -573,16 +521,30 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat if(use_mask) { - args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex)); + if(maskSumTex) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex)); + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data)); + } + args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step)); } - size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0], divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1], 1 }; - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter, @@ -607,7 +569,14 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa size_t localThreads[3] = {3, 3, 3}; size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1}; - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures) @@ -617,16 +586,31 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); + if(sumTex) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported + } args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); size_t localThreads[3] = {32, 4, 1}; size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures) @@ -649,12 +633,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const globalThreads[1] = 16 * localThreads[1]; args.clear(); - args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); + if(imgTex) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data)); + } args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } kernelName = "normalize_descriptors64"; @@ -667,7 +668,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.clear(); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } else { @@ -680,12 +688,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const globalThreads[1] = 16 * localThreads[1]; args.clear(); - args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); + if(imgTex) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data)); + } args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } kernelName = "normalize_descriptors128"; @@ -698,7 +723,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.clear(); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + if(support_image2d()) + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); + } + else + { + openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); + } } }