From fd77a49e76d667182629d42b32adcb44f8632a35 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Thu, 1 Aug 2013 13:06:33 +0800 Subject: [PATCH] Fix ocl compilation error when using Intel OpenCL SDK. --- modules/nonfree/src/opencl/surf.cl | 483 +++++++++++++++-------------- 1 file changed, 255 insertions(+), 228 deletions(-) diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index 140a4d746c..3dced5ea10 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -16,6 +16,7 @@ // // @Authors // Peng Xiao, pengxiao@multicorewareinc.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -43,9 +44,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - // specialized for non-image2d_t supported platform, intel HD4000, for example #ifdef DISABLE_IMAGE2D #define IMAGE_INT32 __global uint * @@ -105,7 +103,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM // for simple haar paatern float icvCalcHaarPatternSum_2( IMAGE_INT32 sumTex, - __constant float src[2][5], + __constant float2 *src, int oldSize, int newSize, int y, int x, @@ -116,21 +114,24 @@ float icvCalcHaarPatternSum_2( F d = 0; -#pragma unroll - for (int k = 0; k < 2; ++k) - { - int dx1 = convert_int_rte(ratio * src[k][0]); - int dy1 = convert_int_rte(ratio * src[k][1]); - int dx2 = convert_int_rte(ratio * src[k][2]); - int dy2 = convert_int_rte(ratio * src[k][3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); - d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); - } + int2 dx1 = convert_int2_rte(ratio * src[0]); + int2 dy1 = convert_int2_rte(ratio * src[1]); + int2 dx2 = convert_int2_rte(ratio * src[2]); + int2 dy2 = convert_int2_rte(ratio * src[3]); + + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); + d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); + d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); return (float)d; } @@ -138,7 +139,7 @@ float icvCalcHaarPatternSum_2( // N = 3 float icvCalcHaarPatternSum_3( IMAGE_INT32 sumTex, - __constant float src[2][5], + __constant float4 *src, int oldSize, int newSize, int y, int x, @@ -149,21 +150,31 @@ float icvCalcHaarPatternSum_3( F d = 0; -#pragma unroll - for (int k = 0; k < 3; ++k) - { - int dx1 = convert_int_rte(ratio * src[k][0]); - int dy1 = convert_int_rte(ratio * src[k][1]); - int dx2 = convert_int_rte(ratio * src[k][2]); - int dy2 = convert_int_rte(ratio * src[k][3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); - d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); - } + int4 dx1 = convert_int4_rte(ratio * src[0]); + int4 dy1 = convert_int4_rte(ratio * src[1]); + int4 dx2 = convert_int4_rte(ratio * src[2]); + int4 dy2 = convert_int4_rte(ratio * src[3]); + + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); + d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); + d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); + d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); return (float)d; } @@ -171,7 +182,7 @@ float icvCalcHaarPatternSum_3( // N = 4 float icvCalcHaarPatternSum_4( IMAGE_INT32 sumTex, - __constant float src[2][5], + __constant float4 *src, int oldSize, int newSize, int y, int x, @@ -182,21 +193,38 @@ float icvCalcHaarPatternSum_4( F d = 0; -#pragma unroll - for (int k = 0; k < 4; ++k) - { - int dx1 = convert_int_rte(ratio * src[k][0]); - int dy1 = convert_int_rte(ratio * src[k][1]); - int dx2 = convert_int_rte(ratio * src[k][2]); - int dy2 = convert_int_rte(ratio * src[k][3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); - d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); - } + int4 dx1 = convert_int4_rte(ratio * src[0]); + int4 dy1 = convert_int4_rte(ratio * src[1]); + int4 dx2 = convert_int4_rte(ratio * src[2]); + int4 dy2 = convert_int4_rte(ratio * src[3]); + + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); + d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); + d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); + d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow ); + t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow ); + t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow ); + d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w)); return (float)d; } @@ -204,9 +232,9 @@ float icvCalcHaarPatternSum_4( //////////////////////////////////////////////////////////////////////// // Hessian -__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} }; -__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; -__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; +__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) }; +__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) }; +__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets. __inline int calcSize(int octave, int layer) { @@ -236,7 +264,7 @@ __kernel void icvCalcLayerDetAndTrace( int c_octave, int c_layer_rows, int sumTex_step - ) +) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); @@ -300,7 +328,7 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro // Non-maximal suppression to further filtering the candidates from previous step __kernel - void icvFindMaximaInLayer_withmask( +void icvFindMaximaInLayer_withmask( __global const float * det, __global const float * trace, __global int4 * maxPosBuffer, @@ -318,7 +346,7 @@ __kernel float c_hessianThreshold, IMAGE_INT32 maskSumTex, int mask_step - ) +) { volatile __local float N9[768]; // threads.x * threads.y * 3 @@ -347,26 +375,26 @@ __kernel const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; N9[localLin - zoff] = det[det_step * - (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x + (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y + + min(max(j, 0), c_img_cols - 1)]; // x N9[localLin ] = det[det_step * - (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x + (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y + + min(max(j, 0), c_img_cols - 1)]; // x N9[localLin + zoff] = det[det_step * - (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x + (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y + + min(max(j, 0), c_img_cols - 1)]; // x barrier(CLK_LOCAL_MEM_FENCE); if (i < c_layer_rows - margin - && j < c_layer_cols - margin - && get_local_id(0) > 0 - && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 - && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA - ) + && j < c_layer_cols - margin + && get_local_id(0) > 0 + && get_local_id(0) < get_local_size(0) - 1 + && get_local_id(1) > 0 + && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA + ) { float val0 = N9[localLin]; @@ -382,34 +410,34 @@ __kernel { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - get_local_size(0) - zoff] - && val0 > N9[localLin + 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - 1 - zoff] - && val0 > N9[localLin - zoff] - && val0 > N9[localLin + 1 - zoff] - && val0 > N9[localLin - 1 + get_local_size(0) - zoff] - && val0 > N9[localLin + get_local_size(0) - zoff] - && val0 > N9[localLin + 1 + get_local_size(0) - zoff] - - && val0 > N9[localLin - 1 - get_local_size(0)] - && val0 > N9[localLin - get_local_size(0)] - && val0 > N9[localLin + 1 - get_local_size(0)] - && val0 > N9[localLin - 1 ] - && val0 > N9[localLin + 1 ] - && val0 > N9[localLin - 1 + get_local_size(0)] - && val0 > N9[localLin + get_local_size(0)] - && val0 > N9[localLin + 1 + get_local_size(0)] - - && val0 > N9[localLin - 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - get_local_size(0) + zoff] - && val0 > N9[localLin + 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - 1 + zoff] - && val0 > N9[localLin + zoff] - && val0 > N9[localLin + 1 + zoff] - && val0 > N9[localLin - 1 + get_local_size(0) + zoff] - && val0 > N9[localLin + get_local_size(0) + zoff] - && val0 > N9[localLin + 1 + get_local_size(0) + zoff] - ; + && val0 > N9[localLin - get_local_size(0) - zoff] + && val0 > N9[localLin + 1 - get_local_size(0) - zoff] + && val0 > N9[localLin - 1 - zoff] + && val0 > N9[localLin - zoff] + && val0 > N9[localLin + 1 - zoff] + && val0 > N9[localLin - 1 + get_local_size(0) - zoff] + && val0 > N9[localLin + get_local_size(0) - zoff] + && val0 > N9[localLin + 1 + get_local_size(0) - zoff] + + && val0 > N9[localLin - 1 - get_local_size(0)] + && val0 > N9[localLin - get_local_size(0)] + && val0 > N9[localLin + 1 - get_local_size(0)] + && val0 > N9[localLin - 1 ] + && val0 > N9[localLin + 1 ] + && val0 > N9[localLin - 1 + get_local_size(0)] + && val0 > N9[localLin + get_local_size(0)] + && val0 > N9[localLin + 1 + get_local_size(0)] + + && val0 > N9[localLin - 1 - get_local_size(0) + zoff] + && val0 > N9[localLin - get_local_size(0) + zoff] + && val0 > N9[localLin + 1 - get_local_size(0) + zoff] + && val0 > N9[localLin - 1 + zoff] + && val0 > N9[localLin + zoff] + && val0 > N9[localLin + 1 + zoff] + && val0 > N9[localLin - 1 + get_local_size(0) + zoff] + && val0 > N9[localLin + get_local_size(0) + zoff] + && val0 > N9[localLin + 1 + get_local_size(0) + zoff] + ; if(condmax) { @@ -428,7 +456,7 @@ __kernel } __kernel - void icvFindMaximaInLayer( +void icvFindMaximaInLayer( __global float * det, __global float * trace, __global int4 * maxPosBuffer, @@ -444,7 +472,7 @@ __kernel int c_layer_cols, int c_max_candidates, float c_hessianThreshold - ) +) { volatile __local float N9[768]; // threads.x * threads.y * 3 @@ -483,12 +511,12 @@ __kernel barrier(CLK_LOCAL_MEM_FENCE); if (i < c_layer_rows - margin - && j < c_layer_cols - margin - && get_local_id(0) > 0 - && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 - && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA - ) + && j < c_layer_cols - margin + && get_local_id(0) > 0 + && get_local_id(0) < get_local_size(0) - 1 + && get_local_id(1) > 0 + && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA + ) { float val0 = N9[localLin]; if (val0 > c_hessianThreshold) @@ -499,38 +527,38 @@ __kernel // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - get_local_size(0) - zoff] - && val0 > N9[localLin + 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - 1 - zoff] - && val0 > N9[localLin - zoff] - && val0 > N9[localLin + 1 - zoff] - && val0 > N9[localLin - 1 + get_local_size(0) - zoff] - && val0 > N9[localLin + get_local_size(0) - zoff] - && val0 > N9[localLin + 1 + get_local_size(0) - zoff] - - && val0 > N9[localLin - 1 - get_local_size(0)] - && val0 > N9[localLin - get_local_size(0)] - && val0 > N9[localLin + 1 - get_local_size(0)] - && val0 > N9[localLin - 1 ] - && val0 > N9[localLin + 1 ] - && val0 > N9[localLin - 1 + get_local_size(0)] - && val0 > N9[localLin + get_local_size(0)] - && val0 > N9[localLin + 1 + get_local_size(0)] - - && val0 > N9[localLin - 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - get_local_size(0) + zoff] - && val0 > N9[localLin + 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - 1 + zoff] - && val0 > N9[localLin + zoff] - && val0 > N9[localLin + 1 + zoff] - && val0 > N9[localLin - 1 + get_local_size(0) + zoff] - && val0 > N9[localLin + get_local_size(0) + zoff] - && val0 > N9[localLin + 1 + get_local_size(0) + zoff] - ; + && val0 > N9[localLin - get_local_size(0) - zoff] + && val0 > N9[localLin + 1 - get_local_size(0) - zoff] + && val0 > N9[localLin - 1 - zoff] + && val0 > N9[localLin - zoff] + && val0 > N9[localLin + 1 - zoff] + && val0 > N9[localLin - 1 + get_local_size(0) - zoff] + && val0 > N9[localLin + get_local_size(0) - zoff] + && val0 > N9[localLin + 1 + get_local_size(0) - zoff] + + && val0 > N9[localLin - 1 - get_local_size(0)] + && val0 > N9[localLin - get_local_size(0)] + && val0 > N9[localLin + 1 - get_local_size(0)] + && val0 > N9[localLin - 1 ] + && val0 > N9[localLin + 1 ] + && val0 > N9[localLin - 1 + get_local_size(0)] + && val0 > N9[localLin + get_local_size(0)] + && val0 > N9[localLin + 1 + get_local_size(0)] + + && val0 > N9[localLin - 1 - get_local_size(0) + zoff] + && val0 > N9[localLin - get_local_size(0) + zoff] + && val0 > N9[localLin + 1 - get_local_size(0) + zoff] + && val0 > N9[localLin - 1 + zoff] + && val0 > N9[localLin + zoff] + && val0 > N9[localLin + 1 + zoff] + && val0 > N9[localLin - 1 + get_local_size(0) + zoff] + && val0 > N9[localLin + get_local_size(0) + zoff] + && val0 > N9[localLin + 1 + get_local_size(0) + zoff] + ; if(condmax) { - int ind = atomic_inc(maxCounter); + int ind = atomic_inc(maxCounter); if (ind < c_max_candidates) { @@ -544,30 +572,30 @@ __kernel } // solve 3x3 linear system Ax=b for floating point input -inline bool solve3x3_float(volatile __local const float A[3][3], volatile __local const float b[3], volatile __local float x[3]) +inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x) { - float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) - + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]); + float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y) + - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x) + + A[0].z * (A[1].x * A[2].y - A[1].y * A[2].x); if (det != 0) { F invdet = 1.0 / det; x[0] = invdet * - (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - - A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + - A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); + (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) - + A[0].y * (b[1] * A[2].z - A[1].z * b[2] ) + + A[0].z * (b[1] * A[2].y - A[1].y * b[2] )); x[1] = invdet * - (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - - b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + - A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); + (A[0].x * (b[1] * A[2].z - A[1].z * b[2] ) - + b[0] * (A[1].x * A[2].z - A[1].z * A[2].x) + + A[0].z * (A[1].x * b[2] - b[1] * A[2].x)); x[2] = invdet * - (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - - A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + - b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); + (A[0].x * (A[1].y * b[2] - b[1] * A[2].y) - + A[0].y * (A[1].x * b[2] - b[1] * A[2].x) + + b[0] * (A[1].x * A[2].y - A[1].y * A[2].x)); return true; } @@ -586,7 +614,7 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc //////////////////////////////////////////////////////////////////////// // INTERPOLATION __kernel - void icvInterpolateKeypoint( +void icvInterpolateKeypoint( __global const float * det, __global const int4 * maxPosBuffer, __global float * keypoints, @@ -598,7 +626,7 @@ __kernel int c_octave, int c_layer_rows, int c_max_features - ) +) { det_step /= sizeof(*det); keypoints_step /= sizeof(*keypoints); @@ -632,26 +660,26 @@ __kernel //ds dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); - volatile __local float H[3][3]; + volatile __local float4 H[3]; //dxx - H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; + H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; //dxy - H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]); + H[0].y= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]); //dxs - H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]); + H[0].z= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]); //dyx = dxy - H[1][0] = H[0][1]; + H[1].x = H[0].y; //dyy - H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1]; + H[1].y = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1]; //dys - H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]); + H[1].z= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]); //dsx = dxs - H[2][0] = H[0][2]; + H[2].x = H[0].z; //dsy = dys - H[2][1] = H[1][2]; + H[2].y = H[1].z; //dss - H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; + H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; volatile __local float x[3]; @@ -689,7 +717,7 @@ __kernel if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) { // Get a new feature index. - int ind = atomic_inc(featureCounter); + int ind = atomic_inc(featureCounter); if (ind < c_max_features) { @@ -716,31 +744,32 @@ __kernel __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, - 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, - 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, - 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, - 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, - 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, - 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, - 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, - 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, - 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, - 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, - 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, - 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, - 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, - 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, - 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, - 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, - 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, - 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, - 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, - 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, - 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, - 0.001707611023448408f, 0.001455130288377404f}; - -__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; -__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; + 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, + 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, + 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, + 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, + 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, + 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, + 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, + 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, + 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, + 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, + 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, + 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, + 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, + 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, + 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, + 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, + 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, + 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, + 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, + 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, + 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, + 0.001707611023448408f, 0.001455130288377404f + }; + +__constant float2 c_NX[5] = { (float2)(0, 2), (float2)(0, 0), (float2)(2, 4), (float2)(4, 4), (float2)(-1, 1) }; +__constant float2 c_NY[5] = { (float2)(0, 0), (float2)(0, 2), (float2)(4, 4), (float2)(2, 4), (float2)(1, -1) }; void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid) { @@ -759,14 +788,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc if (tid < 8) { #endif - data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8]); #if WAVE_SIZE < 8 } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) { #endif - data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4]); #if WAVE_SIZE < 4 } barrier(CLK_LOCAL_MEM_FENCE); @@ -787,14 +816,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc } __kernel - void icvCalcOrientation( +void icvCalcOrientation( IMAGE_INT32 sumTex, __global float * keypoints, int keypoints_step, int c_img_rows, int c_img_cols, int sum_step - ) +) { keypoints_step /= sizeof(*keypoints); sum_step /= sizeof(uint); @@ -838,7 +867,7 @@ __kernel const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin); if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && - x >= 0 && x < (c_img_cols + 1) - grad_wav_size) + x >= 0 && x < (c_img_cols + 1) - grad_wav_size) { X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); @@ -934,11 +963,11 @@ __kernel __kernel - void icvSetUpright( +void icvSetUpright( __global float * keypoints, int keypoints_step, int nFeatures - ) +) { keypoints_step /= sizeof(*keypoints); __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; @@ -988,7 +1017,7 @@ inline uchar readerGet( IMAGE_INT8 src, const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, int i, int j, int rows, int cols, int elemPerRow - ) +) { float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; @@ -999,7 +1028,7 @@ inline float linearFilter( IMAGE_INT8 src, const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, float y, float x, int rows, int cols, int elemPerRow - ) +) { x -= 0.5f; y -= 0.5f; @@ -1028,9 +1057,9 @@ inline float linearFilter( void calc_dx_dy( IMAGE_INT8 imgTex, - volatile __local float s_dx_bin[25], - volatile __local float s_dy_bin[25], - volatile __local float s_PATCH[6][6], + volatile __local float *s_dx_bin, + volatile __local float *s_dy_bin, + volatile __local float *s_PATCH, __global const float* featureX, __global const float* featureY, __global const float* featureSize, @@ -1038,7 +1067,7 @@ void calc_dx_dy( int rows, int cols, int elemPerRow - ) +) { const float centerX = featureX[get_group_id(0)]; const float centerY = featureY[get_group_id(0)]; @@ -1048,6 +1077,7 @@ void calc_dx_dy( { descriptor_dir = 0.0f; } + descriptor_dir *= (float)(CV_PI_F / 180.0f); /* The sampling intervals and wavelet sized for selecting an orientation @@ -1074,7 +1104,7 @@ void calc_dx_dy( const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; - s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); + s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); barrier(CLK_LOCAL_MEM_FENCE); @@ -1085,17 +1115,17 @@ void calc_dx_dy( const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; const float vx = ( - s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) - * dw; + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1] - + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] + + s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] - + s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ]) + * dw; const float vy = ( - s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) - * dw; + s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ] - + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] + + s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] - + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1]) + * dw; s_dx_bin[tid] = vx; s_dy_bin[tid] = vy; } @@ -1106,7 +1136,7 @@ void reduce_sum25( volatile __local float* sdata3, volatile __local float* sdata4, int tid - ) +) { #ifndef WAVE_SIZE #define WAVE_SIZE 1 @@ -1125,11 +1155,8 @@ void reduce_sum25( { #endif sdata1[tid] += sdata1[tid + 8]; - sdata2[tid] += sdata2[tid + 8]; - sdata3[tid] += sdata3[tid + 8]; - sdata4[tid] += sdata4[tid + 8]; #if WAVE_SIZE < 8 } @@ -1166,7 +1193,7 @@ void reduce_sum25( } __kernel - void compute_descriptors64( +void compute_descriptors64( IMAGE_INT8 imgTex, __global float * descriptors, __global const float * keypoints, @@ -1175,7 +1202,7 @@ __kernel int rows, int cols, int img_step - ) +) { descriptors_step /= sizeof(float); keypoints_step /= sizeof(float); @@ -1189,7 +1216,7 @@ __kernel volatile __local float sdy[25]; volatile __local float sdxabs[25]; volatile __local float sdyabs[25]; - volatile __local float s_PATCH[6][6]; + volatile __local float s_PATCH[6*6]; calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); barrier(CLK_LOCAL_MEM_FENCE); @@ -1203,7 +1230,7 @@ __kernel } barrier(CLK_LOCAL_MEM_FENCE); - reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); + reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); barrier(CLK_LOCAL_MEM_FENCE); if (tid < 25) @@ -1221,7 +1248,7 @@ __kernel } } __kernel - void compute_descriptors128( +void compute_descriptors128( IMAGE_INT8 imgTex, __global float * descriptors, __global float * keypoints, @@ -1230,7 +1257,7 @@ __kernel int rows, int cols, int img_step - ) +) { descriptors_step /= sizeof(*descriptors); keypoints_step /= sizeof(*keypoints); @@ -1249,7 +1276,7 @@ __kernel volatile __local float sd2[25]; volatile __local float sdabs1[25]; volatile __local float sdabs2[25]; - volatile __local float s_PATCH[6][6]; + volatile __local float s_PATCH[6*6]; calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); barrier(CLK_LOCAL_MEM_FENCE); @@ -1275,7 +1302,7 @@ __kernel } barrier(CLK_LOCAL_MEM_FENCE); - reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); + reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); barrier(CLK_LOCAL_MEM_FENCE); __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); @@ -1306,8 +1333,7 @@ __kernel } } barrier(CLK_LOCAL_MEM_FENCE); - - reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); + reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); barrier(CLK_LOCAL_MEM_FENCE); if (tid < 25) @@ -1322,11 +1348,13 @@ __kernel } } } + void reduce_sum128(volatile __local float* smem, int tid) { #ifndef WAVE_SIZE #define WAVE_SIZE 1 #endif + if (tid < 64) { smem[tid] += smem[tid + 64]; @@ -1374,6 +1402,8 @@ void reduce_sum128(volatile __local float* smem, int tid) smem[tid] += smem[tid + 1]; } } + + void reduce_sum64(volatile __local float* smem, int tid) { #ifndef WAVE_SIZE @@ -1421,7 +1451,7 @@ void reduce_sum64(volatile __local float* smem, int tid) } __kernel - void normalize_descriptors128(__global float * descriptors, int descriptors_step) +void normalize_descriptors128(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); // no need for thread ID @@ -1436,8 +1466,6 @@ __kernel reduce_sum128(sqDesc, get_local_id(0)); barrier(CLK_LOCAL_MEM_FENCE); - - // compute length (square root) volatile __local float len; if (get_local_id(0) == 0) @@ -1450,7 +1478,7 @@ __kernel descriptor_base[get_local_id(0)] = lookup / len; } __kernel - void normalize_descriptors64(__global float * descriptors, int descriptors_step) +void normalize_descriptors64(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); // no need for thread ID @@ -1462,7 +1490,6 @@ __kernel sqDesc[get_local_id(0)] = lookup * lookup; barrier(CLK_LOCAL_MEM_FENCE); - reduce_sum64(sqDesc, get_local_id(0)); barrier(CLK_LOCAL_MEM_FENCE);