|
|
@ -43,10 +43,39 @@ |
|
|
|
// |
|
|
|
// |
|
|
|
//M*/ |
|
|
|
//M*/ |
|
|
|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_amd_printf : enable |
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable |
|
|
|
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable |
|
|
|
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable |
|
|
|
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// specialized for non-image2d_t supported platform, intel HD4000, for example |
|
|
|
|
|
|
|
#ifdef DISABLE_IMAGE2D |
|
|
|
|
|
|
|
#define IMAGE_INT32 __global uint * |
|
|
|
|
|
|
|
#define IMAGE_INT8 __global uchar * |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
#define IMAGE_INT32 image2d_t |
|
|
|
|
|
|
|
#define IMAGE_INT8 image2d_t |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#ifdef DISABLE_IMAGE2D |
|
|
|
|
|
|
|
int x = clamp(coord.x, 0, cols); |
|
|
|
|
|
|
|
int y = clamp(coord.y, 0, rows); |
|
|
|
|
|
|
|
return img[elemPerRow * y + x]; |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
return read_imageui(img, sam, coord).x; |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#ifdef DISABLE_IMAGE2D |
|
|
|
|
|
|
|
int x = clamp(convert_int_rte(coord.x), 0, cols - 1); |
|
|
|
|
|
|
|
int y = clamp(convert_int_rte(coord.y), 0, rows - 1); |
|
|
|
|
|
|
|
return img[elemPerRow * y + x]; |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
return (uchar)read_imageui(img, sam, coord).x; |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// dynamically change the precision used for floating type |
|
|
|
// dynamically change the precision used for floating type |
|
|
|
|
|
|
|
|
|
|
|
#if defined (__ATI__) || defined (__NVIDIA__) |
|
|
|
#if defined (__ATI__) || defined (__NVIDIA__) |
|
|
@ -58,14 +87,24 @@ |
|
|
|
// Image read mode |
|
|
|
// Image read mode |
|
|
|
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; |
|
|
|
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef FLT_EPSILON |
|
|
|
#define FLT_EPSILON (1e-15) |
|
|
|
#define FLT_EPSILON (1e-15) |
|
|
|
#define CV_PI_F 3.14159265f |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef CV_PI_F |
|
|
|
|
|
|
|
#define CV_PI_F 3.14159265f |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
// Use integral image to calculate haar wavelets. |
|
|
|
// Use integral image to calculate haar wavelets. |
|
|
|
// N = 2 |
|
|
|
// N = 2 |
|
|
|
// for simple haar paatern |
|
|
|
// for simple haar paatern |
|
|
|
float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x) |
|
|
|
float icvCalcHaarPatternSum_2( |
|
|
|
|
|
|
|
IMAGE_INT32 sumTex, |
|
|
|
|
|
|
|
__constant float src[2][5], |
|
|
|
|
|
|
|
int oldSize, |
|
|
|
|
|
|
|
int newSize, |
|
|
|
|
|
|
|
int y, int x, |
|
|
|
|
|
|
|
int rows, int cols, int elemPerRow) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
|
|
|
|
|
|
|
|
F t = 0; |
|
|
|
F t = 0; |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); |
|
|
|
|
|
|
|
|
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// N = 3 |
|
|
|
// N = 3 |
|
|
|
float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x) |
|
|
|
float icvCalcHaarPatternSum_3( |
|
|
|
|
|
|
|
IMAGE_INT32 sumTex, |
|
|
|
|
|
|
|
__constant float src[2][5], |
|
|
|
|
|
|
|
int oldSize, |
|
|
|
|
|
|
|
int newSize, |
|
|
|
|
|
|
|
int y, int x, |
|
|
|
|
|
|
|
int rows, int cols, int elemPerRow) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
|
|
|
|
|
|
|
|
F t = 0; |
|
|
|
F t = 0; |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); |
|
|
|
|
|
|
|
|
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// N = 4 |
|
|
|
// N = 4 |
|
|
|
float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x) |
|
|
|
float icvCalcHaarPatternSum_4( |
|
|
|
|
|
|
|
IMAGE_INT32 sumTex, |
|
|
|
|
|
|
|
__constant float src[2][5], |
|
|
|
|
|
|
|
int oldSize, |
|
|
|
|
|
|
|
int newSize, |
|
|
|
|
|
|
|
int y, int x, |
|
|
|
|
|
|
|
int rows, int cols, int elemPerRow) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
|
float ratio = (float)newSize / oldSize; |
|
|
@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
int dy2 = convert_int_rte(ratio * src[k][3]); |
|
|
|
|
|
|
|
|
|
|
|
F t = 0; |
|
|
|
F t = 0; |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); |
|
|
|
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x; |
|
|
|
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); |
|
|
|
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x; |
|
|
|
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); |
|
|
|
|
|
|
|
|
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer) |
|
|
|
|
|
|
|
|
|
|
|
//calculate targeted layer per-pixel determinant and trace with an integral image |
|
|
|
//calculate targeted layer per-pixel determinant and trace with an integral image |
|
|
|
__kernel void icvCalcLayerDetAndTrace( |
|
|
|
__kernel void icvCalcLayerDetAndTrace( |
|
|
|
image2d_t sumTex, // input integral image |
|
|
|
IMAGE_INT32 sumTex, // input integral image |
|
|
|
__global float * det, // output Determinant |
|
|
|
__global float * det, // output Determinant |
|
|
|
__global float * trace, // output trace |
|
|
|
__global float * trace, // output trace |
|
|
|
int det_step, // the step of det in bytes |
|
|
|
int det_step, // the step of det in bytes |
|
|
@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace( |
|
|
|
int c_img_cols, |
|
|
|
int c_img_cols, |
|
|
|
int c_nOctaveLayers, |
|
|
|
int c_nOctaveLayers, |
|
|
|
int c_octave, |
|
|
|
int c_octave, |
|
|
|
int c_layer_rows |
|
|
|
int c_layer_rows, |
|
|
|
|
|
|
|
int sumTex_step |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
det_step /= sizeof(*det); |
|
|
|
det_step /= sizeof(*det); |
|
|
|
trace_step /= sizeof(*trace); |
|
|
|
trace_step /= sizeof(*trace); |
|
|
|
|
|
|
|
sumTex_step/= sizeof(uint); |
|
|
|
// Determine the indices |
|
|
|
// Determine the indices |
|
|
|
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); |
|
|
|
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); |
|
|
|
const int blockIdx_y = get_group_id(1) % gridDim_y; |
|
|
|
const int blockIdx_y = get_group_id(1) % gridDim_y; |
|
|
@ -205,12 +255,12 @@ __kernel void icvCalcLayerDetAndTrace( |
|
|
|
|
|
|
|
|
|
|
|
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) |
|
|
|
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave); |
|
|
|
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); |
|
|
|
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave); |
|
|
|
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); |
|
|
|
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave); |
|
|
|
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); |
|
|
|
|
|
|
|
|
|
|
|
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; |
|
|
|
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; |
|
|
|
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; |
|
|
|
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace( |
|
|
|
|
|
|
|
|
|
|
|
__constant float c_DM[5] = {0, 0, 9, 9, 1}; |
|
|
|
__constant float c_DM[5] = {0, 0, 9, 9, 1}; |
|
|
|
|
|
|
|
|
|
|
|
bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) |
|
|
|
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step) |
|
|
|
{ |
|
|
|
{ |
|
|
|
float ratio = (float)size / 9.0f; |
|
|
|
float ratio = (float)size / 9.0f; |
|
|
|
|
|
|
|
|
|
|
@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) |
|
|
|
|
|
|
|
|
|
|
|
float t = 0; |
|
|
|
float t = 0; |
|
|
|
|
|
|
|
|
|
|
|
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x; |
|
|
|
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step); |
|
|
|
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x; |
|
|
|
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step); |
|
|
|
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x; |
|
|
|
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step); |
|
|
|
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x; |
|
|
|
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step); |
|
|
|
|
|
|
|
|
|
|
|
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); |
|
|
|
|
|
|
|
|
|
|
@ -246,9 +296,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size) |
|
|
|
// Non-maximal suppression to further filtering the candidates from previous step |
|
|
|
// Non-maximal suppression to further filtering the candidates from previous step |
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void icvFindMaximaInLayer_withmask( |
|
|
|
void icvFindMaximaInLayer_withmask( |
|
|
|
__global const float * det, |
|
|
|
__global const float * det, |
|
|
|
__global const float * trace, |
|
|
|
__global const float * trace, |
|
|
|
__global int4 * maxPosBuffer, |
|
|
|
__global int4 * maxPosBuffer, |
|
|
|
volatile __global unsigned int* maxCounter, |
|
|
|
volatile __global unsigned int* maxCounter, |
|
|
|
int counter_offset, |
|
|
|
int counter_offset, |
|
|
|
int det_step, // the step of det in bytes |
|
|
|
int det_step, // the step of det in bytes |
|
|
@ -261,7 +311,8 @@ __kernel |
|
|
|
int c_layer_cols, |
|
|
|
int c_layer_cols, |
|
|
|
int c_max_candidates, |
|
|
|
int c_max_candidates, |
|
|
|
float c_hessianThreshold, |
|
|
|
float c_hessianThreshold, |
|
|
|
image2d_t maskSumTex |
|
|
|
IMAGE_INT32 maskSumTex, |
|
|
|
|
|
|
|
int mask_step |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
volatile __local float N9[768]; // threads.x * threads.y * 3 |
|
|
|
volatile __local float N9[768]; // threads.x * threads.y * 3 |
|
|
@ -269,6 +320,7 @@ __kernel |
|
|
|
det_step /= sizeof(*det); |
|
|
|
det_step /= sizeof(*det); |
|
|
|
trace_step /= sizeof(*trace); |
|
|
|
trace_step /= sizeof(*trace); |
|
|
|
maxCounter += counter_offset; |
|
|
|
maxCounter += counter_offset; |
|
|
|
|
|
|
|
mask_step /= sizeof(uint); |
|
|
|
|
|
|
|
|
|
|
|
// Determine the indices |
|
|
|
// Determine the indices |
|
|
|
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers; |
|
|
|
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers; |
|
|
@ -288,26 +340,26 @@ __kernel |
|
|
|
// Is this thread within the hessian buffer? |
|
|
|
// Is this thread within the hessian buffer? |
|
|
|
const int zoff = get_local_size(0) * get_local_size(1); |
|
|
|
const int zoff = get_local_size(0) * get_local_size(1); |
|
|
|
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; |
|
|
|
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; |
|
|
|
N9[localLin - zoff] = |
|
|
|
N9[localLin - zoff] = |
|
|
|
det[det_step * |
|
|
|
det[det_step * |
|
|
|
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
N9[localLin ] = |
|
|
|
N9[localLin ] = |
|
|
|
det[det_step * |
|
|
|
det[det_step * |
|
|
|
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
N9[localLin + zoff] = |
|
|
|
N9[localLin + zoff] = |
|
|
|
det[det_step * |
|
|
|
det[det_step * |
|
|
|
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
+ min(max(j, 0), c_img_cols - 1)]; // x |
|
|
|
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
if (i < c_layer_rows - margin |
|
|
|
if (i < c_layer_rows - margin |
|
|
|
&& j < c_layer_cols - margin |
|
|
|
&& j < c_layer_cols - margin |
|
|
|
&& get_local_id(0) > 0 |
|
|
|
&& get_local_id(0) > 0 |
|
|
|
&& get_local_id(0) < get_local_size(0) - 1 |
|
|
|
&& get_local_id(0) < get_local_size(0) - 1 |
|
|
|
&& get_local_id(1) > 0 |
|
|
|
&& get_local_id(1) > 0 |
|
|
|
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA |
|
|
|
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -321,7 +373,7 @@ __kernel |
|
|
|
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; |
|
|
|
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; |
|
|
|
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; |
|
|
|
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; |
|
|
|
|
|
|
|
|
|
|
|
if (within_check(maskSumTex, sum_i, sum_j, size)) |
|
|
|
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step)) |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Check to see if we have a max (in its 26 neighbours) |
|
|
|
// Check to see if we have a max (in its 26 neighbours) |
|
|
|
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] |
|
|
|
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] |
|
|
@ -372,9 +424,9 @@ __kernel |
|
|
|
|
|
|
|
|
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void icvFindMaximaInLayer( |
|
|
|
void icvFindMaximaInLayer( |
|
|
|
__global float * det, |
|
|
|
__global float * det, |
|
|
|
__global float * trace, |
|
|
|
__global float * trace, |
|
|
|
__global int4 * maxPosBuffer, |
|
|
|
__global int4 * maxPosBuffer, |
|
|
|
volatile __global unsigned int* maxCounter, |
|
|
|
volatile __global unsigned int* maxCounter, |
|
|
|
int counter_offset, |
|
|
|
int counter_offset, |
|
|
|
int det_step, // the step of det in bytes |
|
|
|
int det_step, // the step of det in bytes |
|
|
@ -417,19 +469,19 @@ __kernel |
|
|
|
int l_x = min(max(j, 0), c_img_cols - 1); |
|
|
|
int l_x = min(max(j, 0), c_img_cols - 1); |
|
|
|
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); |
|
|
|
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); |
|
|
|
|
|
|
|
|
|
|
|
N9[localLin - zoff] = |
|
|
|
N9[localLin - zoff] = |
|
|
|
det[det_step * (l_y - c_layer_rows) + l_x]; |
|
|
|
det[det_step * (l_y - c_layer_rows) + l_x]; |
|
|
|
N9[localLin ] = |
|
|
|
N9[localLin ] = |
|
|
|
det[det_step * (l_y ) + l_x]; |
|
|
|
det[det_step * (l_y ) + l_x]; |
|
|
|
N9[localLin + zoff] = |
|
|
|
N9[localLin + zoff] = |
|
|
|
det[det_step * (l_y + c_layer_rows) + l_x]; |
|
|
|
det[det_step * (l_y + c_layer_rows) + l_x]; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
if (i < c_layer_rows - margin |
|
|
|
if (i < c_layer_rows - margin |
|
|
|
&& j < c_layer_cols - margin |
|
|
|
&& j < c_layer_cols - margin |
|
|
|
&& get_local_id(0) > 0 |
|
|
|
&& get_local_id(0) > 0 |
|
|
|
&& get_local_id(0) < get_local_size(0) - 1 |
|
|
|
&& get_local_id(0) < get_local_size(0) - 1 |
|
|
|
&& get_local_id(1) > 0 |
|
|
|
&& get_local_id(1) > 0 |
|
|
|
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA |
|
|
|
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -497,17 +549,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc |
|
|
|
{ |
|
|
|
{ |
|
|
|
F invdet = 1.0 / det; |
|
|
|
F invdet = 1.0 / det; |
|
|
|
|
|
|
|
|
|
|
|
x[0] = invdet * |
|
|
|
x[0] = invdet * |
|
|
|
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - |
|
|
|
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - |
|
|
|
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + |
|
|
|
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + |
|
|
|
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); |
|
|
|
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); |
|
|
|
|
|
|
|
|
|
|
|
x[1] = invdet * |
|
|
|
x[1] = invdet * |
|
|
|
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - |
|
|
|
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - |
|
|
|
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + |
|
|
|
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + |
|
|
|
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); |
|
|
|
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); |
|
|
|
|
|
|
|
|
|
|
|
x[2] = invdet * |
|
|
|
x[2] = invdet * |
|
|
|
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - |
|
|
|
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - |
|
|
|
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + |
|
|
|
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + |
|
|
|
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); |
|
|
|
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); |
|
|
@ -528,9 +580,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc |
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////// |
|
|
|
//////////////////////////////////////////////////////////////////////// |
|
|
|
// INTERPOLATION |
|
|
|
// INTERPOLATION |
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void icvInterpolateKeypoint( |
|
|
|
void icvInterpolateKeypoint( |
|
|
|
__global const float * det, |
|
|
|
__global const float * det, |
|
|
|
__global const int4 * maxPosBuffer, |
|
|
|
__global const int4 * maxPosBuffer, |
|
|
|
__global float * keypoints, |
|
|
|
__global float * keypoints, |
|
|
|
volatile __global unsigned int * featureCounter, |
|
|
|
volatile __global unsigned int * featureCounter, |
|
|
@ -560,7 +612,7 @@ __kernel |
|
|
|
|
|
|
|
|
|
|
|
volatile __local float N9[3][3][3]; |
|
|
|
volatile __local float N9[3][3][3]; |
|
|
|
|
|
|
|
|
|
|
|
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = |
|
|
|
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = |
|
|
|
det[det_step * (c_layer_rows * layer + i) + j]; |
|
|
|
det[det_step * (c_layer_rows * layer + i) + j]; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
@ -658,27 +710,27 @@ __kernel |
|
|
|
|
|
|
|
|
|
|
|
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; |
|
|
|
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; |
|
|
|
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; |
|
|
|
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; |
|
|
|
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, |
|
|
|
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, |
|
|
|
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, |
|
|
|
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, |
|
|
|
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, |
|
|
|
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, |
|
|
|
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, |
|
|
|
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, |
|
|
|
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, |
|
|
|
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, |
|
|
|
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, |
|
|
|
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, |
|
|
|
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, |
|
|
|
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, |
|
|
|
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, |
|
|
|
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, |
|
|
|
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, |
|
|
|
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, |
|
|
|
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, |
|
|
|
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, |
|
|
|
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, |
|
|
|
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, |
|
|
|
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, |
|
|
|
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, |
|
|
|
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, |
|
|
|
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, |
|
|
|
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, |
|
|
|
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, |
|
|
|
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, |
|
|
|
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, |
|
|
|
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, |
|
|
|
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, |
|
|
|
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, |
|
|
|
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, |
|
|
|
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, |
|
|
|
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, |
|
|
|
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, |
|
|
|
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, |
|
|
|
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, |
|
|
|
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, |
|
|
|
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, |
|
|
|
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, |
|
|
|
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, |
|
|
|
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, |
|
|
|
0.001707611023448408f, 0.001455130288377404f}; |
|
|
|
0.001707611023448408f, 0.001455130288377404f}; |
|
|
|
|
|
|
|
|
|
|
@ -691,27 +743,29 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int |
|
|
|
data[tid] = partial_reduction; |
|
|
|
data[tid] = partial_reduction; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
if (tid < 16) |
|
|
|
{ |
|
|
|
{ |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
|
|
|
} |
|
|
|
} |
|
|
|
#undef op |
|
|
|
#undef op |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void icvCalcOrientation( |
|
|
|
void icvCalcOrientation( |
|
|
|
image2d_t sumTex, |
|
|
|
IMAGE_INT32 sumTex, |
|
|
|
__global float * keypoints, |
|
|
|
__global float * keypoints, |
|
|
|
int keypoints_step, |
|
|
|
int keypoints_step, |
|
|
|
int c_img_rows, |
|
|
|
int c_img_rows, |
|
|
|
int c_img_cols |
|
|
|
int c_img_cols, |
|
|
|
|
|
|
|
int sum_step |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
keypoints_step /= sizeof(*keypoints); |
|
|
|
keypoints_step /= sizeof(*keypoints); |
|
|
|
|
|
|
|
sum_step /= sizeof(uint); |
|
|
|
__global float* featureX = keypoints + X_ROW * keypoints_step; |
|
|
|
__global float* featureX = keypoints + X_ROW * keypoints_step; |
|
|
|
__global float* featureY = keypoints + Y_ROW * keypoints_step; |
|
|
|
__global float* featureY = keypoints + Y_ROW * keypoints_step; |
|
|
|
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step; |
|
|
|
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step; |
|
|
@ -754,8 +808,8 @@ __kernel |
|
|
|
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && |
|
|
|
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && |
|
|
|
x >= 0 && x < (c_img_cols + 1) - grad_wav_size) |
|
|
|
x >= 0 && x < (c_img_cols + 1) - grad_wav_size) |
|
|
|
{ |
|
|
|
{ |
|
|
|
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x); |
|
|
|
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); |
|
|
|
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x); |
|
|
|
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); |
|
|
|
|
|
|
|
|
|
|
|
angle = atan2(Y, X); |
|
|
|
angle = atan2(Y, X); |
|
|
|
|
|
|
|
|
|
|
@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] = |
|
|
|
|
|
|
|
|
|
|
|
// utility for linear filter |
|
|
|
// utility for linear filter |
|
|
|
inline uchar readerGet( |
|
|
|
inline uchar readerGet( |
|
|
|
image2d_t src, |
|
|
|
IMAGE_INT8 src, |
|
|
|
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, |
|
|
|
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, |
|
|
|
int i, int j |
|
|
|
int i, int j, int rows, int cols, int elemPerRow |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; |
|
|
|
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; |
|
|
|
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; |
|
|
|
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; |
|
|
|
return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x; |
|
|
|
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
inline float linearFilter( |
|
|
|
inline float linearFilter( |
|
|
|
image2d_t src, |
|
|
|
IMAGE_INT8 src, |
|
|
|
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, |
|
|
|
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, |
|
|
|
float y, float x |
|
|
|
float y, float x, int rows, int cols, int elemPerRow |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
x -= 0.5f; |
|
|
|
x -= 0.5f; |
|
|
@ -907,30 +961,33 @@ inline float linearFilter( |
|
|
|
const int x2 = x1 + 1; |
|
|
|
const int x2 = x1 + 1; |
|
|
|
const int y2 = y1 + 1; |
|
|
|
const int y2 = y1 + 1; |
|
|
|
|
|
|
|
|
|
|
|
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1); |
|
|
|
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow); |
|
|
|
out = out + src_reg * ((x2 - x) * (y2 - y)); |
|
|
|
out = out + src_reg * ((x2 - x) * (y2 - y)); |
|
|
|
|
|
|
|
|
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2); |
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow); |
|
|
|
out = out + src_reg * ((x - x1) * (y2 - y)); |
|
|
|
out = out + src_reg * ((x - x1) * (y2 - y)); |
|
|
|
|
|
|
|
|
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1); |
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow); |
|
|
|
out = out + src_reg * ((x2 - x) * (y - y1)); |
|
|
|
out = out + src_reg * ((x2 - x) * (y - y1)); |
|
|
|
|
|
|
|
|
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2); |
|
|
|
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow); |
|
|
|
out = out + src_reg * ((x - x1) * (y - y1)); |
|
|
|
out = out + src_reg * ((x - x1) * (y - y1)); |
|
|
|
|
|
|
|
|
|
|
|
return out; |
|
|
|
return out; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void calc_dx_dy( |
|
|
|
void calc_dx_dy( |
|
|
|
image2d_t imgTex, |
|
|
|
IMAGE_INT8 imgTex, |
|
|
|
volatile __local float s_dx_bin[25], |
|
|
|
volatile __local float s_dx_bin[25], |
|
|
|
volatile __local float s_dy_bin[25], |
|
|
|
volatile __local float s_dy_bin[25], |
|
|
|
volatile __local float s_PATCH[6][6], |
|
|
|
volatile __local float s_PATCH[6][6], |
|
|
|
__global const float* featureX, |
|
|
|
__global const float* featureX, |
|
|
|
__global const float* featureY, |
|
|
|
__global const float* featureY, |
|
|
|
__global const float* featureSize, |
|
|
|
__global const float* featureSize, |
|
|
|
__global const float* featureDir |
|
|
|
__global const float* featureDir, |
|
|
|
|
|
|
|
int rows, |
|
|
|
|
|
|
|
int cols, |
|
|
|
|
|
|
|
int elemPerRow |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const float centerX = featureX[get_group_id(0)]; |
|
|
|
const float centerX = featureX[get_group_id(0)]; |
|
|
@ -965,7 +1022,7 @@ void calc_dx_dy( |
|
|
|
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; |
|
|
|
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; |
|
|
|
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; |
|
|
|
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; |
|
|
|
|
|
|
|
|
|
|
|
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo); |
|
|
|
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); |
|
|
|
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
@ -976,26 +1033,26 @@ void calc_dx_dy( |
|
|
|
const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; |
|
|
|
const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; |
|
|
|
|
|
|
|
|
|
|
|
const float vx = ( |
|
|
|
const float vx = ( |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) ] + |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) ] + |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) |
|
|
|
* dw; |
|
|
|
* dw; |
|
|
|
const float vy = ( |
|
|
|
const float vy = ( |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) ] + |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) ] + |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) |
|
|
|
s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) |
|
|
|
* dw; |
|
|
|
* dw; |
|
|
|
s_dx_bin[tid] = vx; |
|
|
|
s_dx_bin[tid] = vx; |
|
|
|
s_dy_bin[tid] = vy; |
|
|
|
s_dy_bin[tid] = vy; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
void reduce_sum25( |
|
|
|
void reduce_sum25( |
|
|
|
volatile __local float* sdata1, |
|
|
|
volatile __local float* sdata1, |
|
|
|
volatile __local float* sdata2, |
|
|
|
volatile __local float* sdata2, |
|
|
|
volatile __local float* sdata3, |
|
|
|
volatile __local float* sdata3, |
|
|
|
volatile __local float* sdata4, |
|
|
|
volatile __local float* sdata4, |
|
|
|
int tid |
|
|
|
int tid |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -1033,18 +1090,20 @@ void reduce_sum25( |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void compute_descriptors64( |
|
|
|
void compute_descriptors64( |
|
|
|
image2d_t imgTex, |
|
|
|
IMAGE_INT8 imgTex, |
|
|
|
volatile __global float * descriptors, |
|
|
|
volatile __global float * descriptors, |
|
|
|
__global const float * keypoints, |
|
|
|
__global const float * keypoints, |
|
|
|
int descriptors_step, |
|
|
|
int descriptors_step, |
|
|
|
int keypoints_step |
|
|
|
int keypoints_step, |
|
|
|
|
|
|
|
int rows, |
|
|
|
|
|
|
|
int cols, |
|
|
|
|
|
|
|
int img_step |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
descriptors_step /= sizeof(float); |
|
|
|
descriptors_step /= sizeof(float); |
|
|
|
keypoints_step /= sizeof(float); |
|
|
|
keypoints_step /= sizeof(float); |
|
|
|
|
|
|
|
|
|
|
|
__global const float * featureX = keypoints + X_ROW * keypoints_step; |
|
|
|
__global const float * featureX = keypoints + X_ROW * keypoints_step; |
|
|
|
__global const float * featureY = keypoints + Y_ROW * keypoints_step; |
|
|
|
__global const float * featureY = keypoints + Y_ROW * keypoints_step; |
|
|
|
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step; |
|
|
|
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step; |
|
|
@ -1057,7 +1116,7 @@ __kernel |
|
|
|
volatile __local float sdyabs[25]; |
|
|
|
volatile __local float sdyabs[25]; |
|
|
|
volatile __local float s_PATCH[6][6]; |
|
|
|
volatile __local float s_PATCH[6][6]; |
|
|
|
|
|
|
|
|
|
|
|
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); |
|
|
|
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); |
|
|
|
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); |
|
|
@ -1066,10 +1125,10 @@ __kernel |
|
|
|
{ |
|
|
|
{ |
|
|
|
sdxabs[tid] = fabs(sdx[tid]); // |dx| array |
|
|
|
sdxabs[tid] = fabs(sdx[tid]); // |dx| array |
|
|
|
sdyabs[tid] = fabs(sdy[tid]); // |dy| array |
|
|
|
sdyabs[tid] = fabs(sdy[tid]); // |dy| array |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); |
|
|
|
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); |
|
|
|
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); |
|
|
|
|
|
|
|
|
|
|
@ -1083,13 +1142,16 @@ __kernel |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void compute_descriptors128( |
|
|
|
void compute_descriptors128( |
|
|
|
image2d_t imgTex, |
|
|
|
IMAGE_INT8 imgTex, |
|
|
|
__global volatile float * descriptors, |
|
|
|
__global volatile float * descriptors, |
|
|
|
__global float * keypoints, |
|
|
|
__global float * keypoints, |
|
|
|
int descriptors_step, |
|
|
|
int descriptors_step, |
|
|
|
int keypoints_step |
|
|
|
int keypoints_step, |
|
|
|
|
|
|
|
int rows, |
|
|
|
|
|
|
|
int cols, |
|
|
|
|
|
|
|
int img_step |
|
|
|
) |
|
|
|
) |
|
|
|
{ |
|
|
|
{ |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
@ -1111,7 +1173,7 @@ __kernel |
|
|
|
volatile __local float sdabs2[25]; |
|
|
|
volatile __local float sdabs2[25]; |
|
|
|
volatile __local float s_PATCH[6][6]; |
|
|
|
volatile __local float s_PATCH[6][6]; |
|
|
|
|
|
|
|
|
|
|
|
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); |
|
|
|
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); |
|
|
|
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); |
|
|
@ -1132,10 +1194,10 @@ __kernel |
|
|
|
sd2[tid] = sdx[tid]; |
|
|
|
sd2[tid] = sdx[tid]; |
|
|
|
sdabs2[tid] = fabs(sdx[tid]); |
|
|
|
sdabs2[tid] = fabs(sdx[tid]); |
|
|
|
} |
|
|
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); |
|
|
|
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); |
|
|
|
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); |
|
|
|
|
|
|
|
|
|
|
@ -1162,10 +1224,10 @@ __kernel |
|
|
|
sd2[tid] = sdy[tid]; |
|
|
|
sd2[tid] = sdy[tid]; |
|
|
|
sdabs2[tid] = fabs(sdy[tid]); |
|
|
|
sdabs2[tid] = fabs(sdy[tid]); |
|
|
|
} |
|
|
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); |
|
|
|
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
//barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) |
|
|
|
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) |
|
|
|
if (tid == 0) |
|
|
|
if (tid == 0) |
|
|
@ -1178,7 +1240,7 @@ __kernel |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void normalize_descriptors128(__global float * descriptors, int descriptors_step) |
|
|
|
void normalize_descriptors128(__global float * descriptors, int descriptors_step) |
|
|
|
{ |
|
|
|
{ |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
@ -1219,7 +1281,7 @@ __kernel |
|
|
|
// normalize and store in output |
|
|
|
// normalize and store in output |
|
|
|
descriptor_base[get_local_id(0)] = lookup / len; |
|
|
|
descriptor_base[get_local_id(0)] = lookup / len; |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel |
|
|
|
__kernel |
|
|
|
void normalize_descriptors64(__global float * descriptors, int descriptors_step) |
|
|
|
void normalize_descriptors64(__global float * descriptors, int descriptors_step) |
|
|
|
{ |
|
|
|
{ |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
|
descriptors_step /= sizeof(*descriptors); |
|
|
|