Merge pull request #1730 from ilya-lavrenov:ocl_kern_warn

pull/1729/merge
Roman Donchenko 11 years ago committed by OpenCV Buildbot
commit a9a39a3aa2
  1. 2
      modules/ocl/src/opencl/arithm_bitwise_not.cl
  2. 22
      modules/ocl/src/opencl/arithm_cartToPolar.cl
  3. 200
      modules/ocl/src/opencl/arithm_minMax.cl
  4. 206
      modules/ocl/src/opencl/arithm_minMaxLoc.cl
  5. 182
      modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl
  6. 8
      modules/ocl/src/opencl/arithm_nonzero.cl
  7. 18
      modules/ocl/src/opencl/arithm_phase.cl
  8. 13
      modules/ocl/src/opencl/arithm_polarToCart.cl
  9. 58
      modules/ocl/src/opencl/arithm_sum.cl
  10. 18
      modules/ocl/src/opencl/brute_force_match.cl
  11. 6
      modules/ocl/src/opencl/cvt_color.cl
  12. 1
      modules/ocl/src/opencl/haarobjectdetect.cl
  13. 1
      modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
  14. 9
      modules/ocl/src/opencl/imgproc_calcHarris.cl
  15. 9
      modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
  16. 12
      modules/ocl/src/opencl/imgproc_canny.cl
  17. 12
      modules/ocl/src/opencl/imgproc_clahe.cl
  18. 24
      modules/ocl/src/opencl/imgproc_integral.cl
  19. 16
      modules/ocl/src/opencl/imgproc_integral_sum.cl
  20. 32
      modules/ocl/src/opencl/imgproc_median.cl
  21. 18
      modules/ocl/src/opencl/imgproc_remap.cl
  22. 24
      modules/ocl/src/opencl/imgproc_resize.cl
  23. 20
      modules/ocl/src/opencl/imgproc_threshold.cl
  24. 8
      modules/ocl/src/opencl/imgproc_warpAffine.cl
  25. 34
      modules/ocl/src/opencl/imgproc_warpPerspective.cl
  26. 1
      modules/ocl/src/opencl/kernel_sort_by_key.cl
  27. 21
      modules/ocl/src/opencl/kernel_stablesort_by_key.cl
  28. 73
      modules/ocl/src/opencl/knearest.cl
  29. 6
      modules/ocl/src/opencl/match_template.cl
  30. 3
      modules/ocl/src/opencl/meanShift.cl
  31. 24
      modules/ocl/src/opencl/moments.cl
  32. 4
      modules/ocl/src/opencl/objdetect_hog.cl
  33. 4
      modules/ocl/src/opencl/optical_flow_farneback.cl
  34. 12
      modules/ocl/src/opencl/pyr_down.cl
  35. 29
      modules/ocl/src/opencl/pyrlk.cl
  36. 4
      modules/ocl/src/opencl/split_mat.cl
  37. 14
      modules/ocl/src/opencl/stereobm.cl
  38. 8
      modules/ocl/src/opencl/stereobp.cl
  39. 26
      modules/ocl/src/opencl/stereocsbp.cl

@ -67,7 +67,6 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
int src1_index = mad24(y, src1_step, x + src1_offset); int src1_index = mad24(y, src1_step, x + src1_offset);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x); int dst_index = mad24(y, dst_step, dst_offset + x);
@ -97,7 +96,6 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
int src1_index = mad24(y, src1_step, x + src1_offset); int src1_index = mad24(y, src1_step, x + src1_offset);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x); int dst_index = mad24(y, dst_step, dst_offset + x);

@ -44,14 +44,18 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#define CV_PI 3.1415926535897932384626433832795
#ifndef DBL_EPSILON
#define DBL_EPSILON 0x1.0p-52
#endif
#else
#define CV_PI 3.1415926535897932384626433832795f
#ifndef DBL_EPSILON
#define DBL_EPSILON 0x1.0p-52f
#endif
#endif #endif
#define CV_PI 3.1415926535897932384626433832795
#ifndef DBL_EPSILON
#define DBL_EPSILON 0x1.0p-52
#endif
__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset, __global float *src2, int src2_step, int src2_offset,
@ -82,9 +86,9 @@ __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int sr
float tmp = y >= 0 ? 0 : CV_PI*2; float tmp = y >= 0 ? 0 : CV_PI*2;
tmp = x < 0 ? CV_PI : tmp; tmp = x < 0 ? CV_PI : tmp;
float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5; float tmp1 = y >= 0 ? CV_PI*0.5f : CV_PI*1.5f;
cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + (float)DBL_EPSILON) + tmp : cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + DBL_EPSILON) + tmp :
tmp1 - x*y/(y2 + 0.28f*x2 + (float)DBL_EPSILON); tmp1 - x*y/(y2 + 0.28f*x2 + DBL_EPSILON);
cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI); cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI);

@ -66,53 +66,53 @@
__kernel void arithm_op_minMax(__global const T * src, __global T * dst, __kernel void arithm_op_minMax(__global const T * src, __global T * dst,
int cols, int invalid_cols, int offset, int elemnum, int groupnum) int cols, int invalid_cols, int offset, int elemnum, int groupnum)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols; int idx = offset + id + (id / cols) * invalid_cols;
__local T localmem_max[128], localmem_min[128]; __local T localmem_max[128], localmem_min[128];
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
{ {
idx = offset + id + (id / cols) * invalid_cols; idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx]; temp = src[idx];
minval = min(minval, temp); minval = min(minval, temp);
maxval = max(maxval, temp); maxval = max(maxval, temp);
} }
if (lid > 127) if (lid > 127)
{ {
localmem_min[lid - 128] = minval; localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval; localmem_max[lid - 128] = maxval;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 128) if (lid < 128)
{ {
localmem_min[lid] = min(minval, localmem_min[lid]); localmem_min[lid] = min(minval, localmem_min[lid]);
localmem_max[lid] = max(maxval, localmem_max[lid]); localmem_max[lid] = max(maxval, localmem_max[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = 64; lsize > 0; lsize >>= 1) for (int lsize = 64; lsize > 0; lsize >>= 1)
{ {
if (lid < lsize) if (lid < lsize)
{ {
int lid2 = lsize + lid; int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (lid == 0) if (lid == 0)
{ {
dst[gid] = localmem_min[0]; dst[gid] = localmem_min[0];
dst[gid + groupnum] = localmem_max[0]; dst[gid + groupnum] = localmem_max[0];
} }
} }
__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
@ -120,57 +120,57 @@ __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
int elemnum, int groupnum, int elemnum, int groupnum,
const __global uchar * mask, int minvalid_cols, int moffset) const __global uchar * mask, int minvalid_cols, int moffset)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols; int idx = offset + id + (id / cols) * invalid_cols;
unsigned int midx = moffset + id + (id / cols) * minvalid_cols; int midx = moffset + id + (id / cols) * minvalid_cols;
__local T localmem_max[128], localmem_min[128]; __local T localmem_max[128], localmem_min[128];
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
{ {
idx = offset + id + (id / cols) * invalid_cols; idx = offset + id + (id / cols) * invalid_cols;
midx = moffset + id + (id / cols) * minvalid_cols; midx = moffset + id + (id / cols) * minvalid_cols;
if (mask[midx]) if (mask[midx])
{ {
temp = src[idx]; temp = src[idx];
minval = min(minval, temp); minval = min(minval, temp);
maxval = max(maxval, temp); maxval = max(maxval, temp);
} }
} }
if (lid > 127) if (lid > 127)
{ {
localmem_min[lid - 128] = minval; localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval; localmem_max[lid - 128] = maxval;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 128) if (lid < 128)
{ {
localmem_min[lid] = min(minval, localmem_min[lid]); localmem_min[lid] = min(minval, localmem_min[lid]);
localmem_max[lid] = max(maxval, localmem_max[lid]); localmem_max[lid] = max(maxval, localmem_max[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = 64; lsize > 0; lsize >>= 1) for (int lsize = 64; lsize > 0; lsize >>= 1)
{ {
if (lid < lsize) if (lid < lsize)
{ {
int lid2 = lsize + lid; int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (lid == 0) if (lid == 0)
{ {
dst[gid] = localmem_min[0]; dst[gid] = localmem_min[0];
dst[gid + groupnum] = localmem_max[0]; dst[gid + groupnum] = localmem_max[0];
} }
} }

@ -137,118 +137,114 @@
#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; #define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
#endif #endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array minMax**************************************/ /**************************************Array minMax**************************************/
__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst) __global VEC_TYPE *src, __global RES_TYPE *dst)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols; int idx = offset + id + (id / cols) * invalid_cols;
__local VEC_TYPE localmem_max[128], localmem_min[128]; __local VEC_TYPE localmem_max[128], localmem_min[128];
VEC_TYPE minval, maxval, temp; VEC_TYPE minval, maxval, temp;
__local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
int idx_c; int idx_c;
if (id < elemnum) if (id < elemnum)
{ {
temp = src[idx]; temp = src[idx];
idx_c = idx << 2; idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
if (id % cols == 0 ) if (id % cols == 0 )
{ {
repeat_s(temp); repeat_s(temp);
repeat_s(temploc); repeat_s(temploc);
} }
if (id % cols == cols - 1) if (id % cols == cols - 1)
{ {
repeat_e(temp); repeat_e(temp);
repeat_e(temploc); repeat_e(temploc);
} }
minval = temp; minval = temp;
maxval = temp; maxval = temp;
minloc = temploc; minloc = temploc;
maxloc = temploc; maxloc = temploc;
} }
else else
{ {
minval = MAX_VAL; minval = MAX_VAL;
maxval = MIN_VAL; maxval = MIN_VAL;
minloc = negative; minloc = negative;
maxloc = negative; maxloc = negative;
} }
int grainSize = (groupnum << 8); int grainSize = (groupnum << 8);
for (id = id + grainSize; id < elemnum; id = id + grainSize) for (id = id + grainSize; id < elemnum; id = id + grainSize)
{ {
idx = offset + id + (id / cols) * invalid_cols; idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx]; temp = src[idx];
idx_c = idx << 2; idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
if (id % cols == 0 ) if (id % cols == 0 )
{ {
repeat_s(temp); repeat_s(temp);
repeat_s(temploc); repeat_s(temploc);
} }
if (id % cols == cols - 1) if (id % cols == cols - 1)
{ {
repeat_e(temp); repeat_e(temp);
repeat_e(temploc); repeat_e(temploc);
} }
minval = min(minval, temp); minval = min(minval, temp);
maxval = max(maxval, temp); maxval = max(maxval, temp);
minloc = CONDITION_FUNC(minval == temp, temploc, minloc); minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
} }
if (lid > 127) if (lid > 127)
{ {
localmem_min[lid - 128] = minval; localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval; localmem_max[lid - 128] = maxval;
localmem_minloc[lid - 128] = minloc; localmem_minloc[lid - 128] = minloc;
localmem_maxloc[lid - 128] = maxloc; localmem_maxloc[lid - 128] = maxloc;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 128) if (lid < 128)
{ {
localmem_min[lid] = min(minval,localmem_min[lid]); localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]); localmem_max[lid] = max(maxval,localmem_max[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = 64; lsize > 0; lsize >>= 1) for (int lsize = 64; lsize > 0; lsize >>= 1)
{ {
if (lid < lsize) if (lid < lsize)
{ {
int lid2 = lsize + lid; int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if ( lid == 0) if ( lid == 0)
{ {
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
} }
} }

@ -147,96 +147,96 @@
__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src, __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src,
int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst) int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = id + (id / cols) * invalid_cols; int idx = id + (id / cols) * invalid_cols;
unsigned int midx = id + (id / cols) * minvalid_cols; int midx = id + (id / cols) * minvalid_cols;
__local VEC_TYPE lm_max[128],lm_min[128]; __local VEC_TYPE lm_max[128],lm_min[128];
VEC_TYPE minval,maxval,temp,m_temp; VEC_TYPE minval,maxval,temp,m_temp;
__local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128]; __local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0;
if(id < elemnum) if(id < elemnum)
{ {
temp = vload4(idx, &src[offset]); temp = vload4(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
int idx_c = (idx << 2) + offset; int idx_c = (idx << 2) + offset;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == cols - 1) if(id % cols == cols - 1)
{ {
repeat_me(m_temp); repeat_me(m_temp);
repeat_e(temploc); repeat_e(temploc);
} }
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative); minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative);
maxloc = minloc; maxloc = minloc;
} }
else else
{ {
minval = MAX_VAL; minval = MAX_VAL;
maxval = MIN_VAL; maxval = MIN_VAL;
minloc = negative; minloc = negative;
maxloc = negative; maxloc = negative;
} }
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{ {
idx = id + (id / cols) * invalid_cols; idx = id + (id / cols) * invalid_cols;
midx = id + (id / cols) * minvalid_cols; midx = id + (id / cols) * minvalid_cols;
temp = vload4(idx, &src[offset]); temp = vload4(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
int idx_c = (idx << 2) + offset; int idx_c = (idx << 2) + offset;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == cols - 1) if(id % cols == cols - 1)
{ {
repeat_me(m_temp); repeat_me(m_temp);
repeat_e(temploc); repeat_e(temploc);
} }
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval); maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc); minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc); maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
} }
if(lid > 127) if(lid > 127)
{ {
lm_min[lid - 128] = minval; lm_min[lid - 128] = minval;
lm_max[lid - 128] = maxval; lm_max[lid - 128] = maxval;
lm_minloc[lid - 128] = minloc; lm_minloc[lid - 128] = minloc;
lm_maxloc[lid - 128] = maxloc; lm_maxloc[lid - 128] = maxloc;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128) if(lid < 128)
{ {
lm_min[lid] = min(minval,lm_min[lid]); lm_min[lid] = min(minval,lm_min[lid]);
lm_max[lid] = max(maxval,lm_max[lid]); lm_max[lid] = max(maxval,lm_max[lid]);
VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero); VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero); VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero);
lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]); lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]);
lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]); lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1) for(int lsize = 64; lsize > 0; lsize >>= 1)
{ {
if(lid < lsize) if(lid < lsize)
{ {
int lid2 = lsize + lid; int lid2 = lsize + lid;
lm_min[lid] = min(lm_min[lid] , lm_min[lid2]); lm_min[lid] = min(lm_min[lid] , lm_min[lid2]);
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]); lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero); VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero); VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
lm_minloc[lid] = lm_minloc[lid] =
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]); CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
lm_maxloc[lid] = lm_maxloc[lid] =
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]); CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if( lid == 0) if( lid == 0)
{ {
dst[gid] = CONVERT_RES_TYPE(lm_min[0]); dst[gid] = CONVERT_RES_TYPE(lm_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]); dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]); dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
} }
} }

@ -55,11 +55,11 @@
__kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
__global srcT *src, __global dstT *dst) __global srcT *src, __global dstT *dst)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols; int idx = offset + id + (id / cols) * invalid_cols;
__local dstT localmem_nonzero[128]; __local dstT localmem_nonzero[128];
dstT nonzero = (dstT)(0); dstT nonzero = (dstT)(0);
srcT zero = (srcT)(0), one = (srcT)(1); srcT zero = (srcT)(0), one = (srcT)(1);

@ -45,15 +45,17 @@
// //
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64 #ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64) #elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable #pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 2*CV_PI
#else
#define CV_PI 3.1415926535897932384626433832795f
#define CV_2PI 2*CV_PI
#endif #endif
#endif
#define CV_PI 3.1415926535898
#define CV_2PI 2*3.1415926535898
/**************************************phase inradians**************************************/ /**************************************phase inradians**************************************/

@ -43,12 +43,13 @@
// //
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #ifdef DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#define CV_PI 3.1415926535897932384626433832795
#else
#define CV_PI 3.1415926535897932384626433832795f
#endif #endif
#define CV_PI 3.1415926535897932384626433832795
///////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////polarToCart with magnitude////////////////////////////// /////////////////////////////////////////polarToCart with magnitude//////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
@ -72,7 +73,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
float x = *((__global float *)((__global char *)src1 + src1_index)); float x = *((__global float *)((__global char *)src1 + src1_index));
float y = *((__global float *)((__global char *)src2 + src2_index)); float y = *((__global float *)((__global char *)src2 + src2_index));
float ascale = CV_PI/180.0; float ascale = CV_PI/180.0f;
float alpha = angInDegree == 1 ? y * ascale : y; float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha) * x; float a = cos(alpha) * x;
float b = sin(alpha) * x; float b = sin(alpha) * x;
@ -134,7 +135,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
float y = *((__global float *)((__global char *)src + src_index)); float y = *((__global float *)((__global char *)src + src_index));
float ascale = CV_PI/180.0; float ascale = CV_PI/180.0f;
float alpha = angInDegree == 1 ? y * ascale : y; float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha); float a = cos(alpha);
float b = sin(alpha); float b = sin(alpha);

@ -66,39 +66,39 @@
__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, __kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global srcT *src, __global dstT *dst) __global srcT *src, __global dstT *dst)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
unsigned int id = get_global_id(0); int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols; int idx = offset + id + (id / cols) * invalid_cols;
__local dstT localmem_sum[128]; __local dstT localmem_sum[128];
dstT sum = (dstT)(0), temp; dstT sum = (dstT)(0), temp;
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
{ {
idx = offset + id + (id / cols) * invalid_cols; idx = offset + id + (id / cols) * invalid_cols;
temp = convertToDstT(src[idx]); temp = convertToDstT(src[idx]);
FUNC(temp, sum); FUNC(temp, sum);
} }
if (lid > 127) if (lid > 127)
localmem_sum[lid - 128] = sum; localmem_sum[lid - 128] = sum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 128) if (lid < 128)
localmem_sum[lid] = sum + localmem_sum[lid]; localmem_sum[lid] = sum + localmem_sum[lid];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = 64; lsize > 0; lsize >>= 1) for (int lsize = 64; lsize > 0; lsize >>= 1)
{ {
if (lid < lsize) if (lid < lsize)
{ {
int lid2 = lsize + lid; int lid2 = lsize + lid;
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (lid == 0) if (lid == 0)
dst[gid] = localmem_sum[0]; dst[gid] = localmem_sum[0];
} }

@ -64,7 +64,7 @@
#endif #endif
//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel //http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
int bit1Count(int v) static int bit1Count(int v)
{ {
v = v - ((v >> 1) & 0x55555555); // reuse input as temporary v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
@ -95,7 +95,7 @@ typedef int result_type;
#define DIST_RES(x) (x) #define DIST_RES(x) (x)
#endif #endif
result_type reduce_block( static result_type reduce_block(
__local value_type *s_query, __local value_type *s_query,
__local value_type *s_train, __local value_type *s_train,
int lidx, int lidx,
@ -113,7 +113,7 @@ result_type reduce_block(
return DIST_RES(result); return DIST_RES(result);
} }
result_type reduce_block_match( static result_type reduce_block_match(
__local value_type *s_query, __local value_type *s_query,
__local value_type *s_train, __local value_type *s_train,
int lidx, int lidx,
@ -131,7 +131,7 @@ result_type reduce_block_match(
return (result); return (result);
} }
result_type reduce_multi_block( static result_type reduce_multi_block(
__local value_type *s_query, __local value_type *s_query,
__local value_type *s_train, __local value_type *s_train,
int block_index, int block_index,
@ -187,7 +187,6 @@ __kernel void BruteForceMatch_UnrollMatch(
int myBestTrainIdx = -1; int myBestTrainIdx = -1;
// loopUnrolledCached to find the best trainIdx and best distance. // loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0;
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
{ {
result_type result = 0; result_type result = 0;
@ -212,7 +211,6 @@ __kernel void BruteForceMatch_UnrollMatch(
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
{ {
//bestImgIdx = imgIdx;
myBestDistance = result; myBestDistance = result;
myBestTrainIdx = trainIdx; myBestTrainIdx = trainIdx;
} }
@ -304,7 +302,6 @@ __kernel void BruteForceMatch_Match(
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{ {
//myBestImgidx = imgIdx;
myBestDistance = result; myBestDistance = result;
myBestTrainIdx = trainIdx; myBestTrainIdx = trainIdx;
} }
@ -390,11 +387,10 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
if (queryIdx < query_rows && trainIdx < train_rows && if (queryIdx < query_rows && trainIdx < train_rows &&
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
{ {
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols) if(ind < bestTrainIdx_cols)
{ {
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
} }
@ -451,11 +447,10 @@ __kernel void BruteForceMatch_RadiusMatch(
if (queryIdx < query_rows && trainIdx < train_rows && if (queryIdx < query_rows && trainIdx < train_rows &&
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
{ {
unsigned int ind = atom_inc(nMatches + queryIdx); int ind = atom_inc(nMatches + queryIdx);
if(ind < bestTrainIdx_cols) if(ind < bestTrainIdx_cols)
{ {
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
} }
@ -498,7 +493,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
int myBestTrainIdx2 = -1; int myBestTrainIdx2 = -1;
//loopUnrolledCached //loopUnrolledCached
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{ {
result_type result = 0; result_type result = 0;

@ -50,8 +50,6 @@
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif #endif
#define DATA_TYPE UNDEFINED
#if defined (DEPTH_0) #if defined (DEPTH_0)
#define DATA_TYPE uchar #define DATA_TYPE uchar
#define MAX_NUM 255 #define MAX_NUM 255
@ -73,6 +71,10 @@
#define SAT_CAST(num) (num) #define SAT_CAST(num) (num)
#endif #endif
#ifndef DATA_TYPE
#define DATA_TYPE UNDEFINED
#endif
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
enum enum

@ -37,7 +37,6 @@
// //
// //
#pragma OPENCL EXTENSION cl_amd_printf : enable
#define CV_HAAR_FEATURE_MAX 3 #define CV_HAAR_FEATURE_MAX 3
#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) #define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])

@ -120,7 +120,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int grpidx = get_group_id(0); int grpidx = get_group_id(0);
int lclidx = get_local_id(0); int lclidx = get_local_id(0);
int lclidy = get_local_id(1); int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx, grpszy);
int lcl_id = mad24(lclidy, grpszx, lclidx); int lcl_id = mad24(lclidy, grpszx, lclidx);
__local int glboutindex[1]; __local int glboutindex[1];
__local int lclcount[1]; __local int lclcount[1];

@ -99,7 +99,6 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
int col = get_local_id(0); int col = get_local_id(0);
int gX = get_group_id(0); int gX = get_group_id(0);
int gY = get_group_id(1); int gY = get_group_id(1);
int glx = get_global_id(0);
int gly = get_global_id(1); int gly = get_global_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2; int dx_x_off = (dx_offset % dx_step) >> 2;
@ -126,11 +125,11 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
{ {
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0; dx_data[i] = dx_con ? dx_s : 0.0f;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0; dy_data[i] = dy_con ? dy_s : 0.0f;
data[0][i] = dx_data[i] * dx_data[i]; data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i]; data[1][i] = dx_data[i] * dy_data[i];
@ -155,7 +154,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
data[2][i] = dy_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i];
} }
#endif #endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
for (int i=1; i < ksY; i++) for (int i=1; i < ksY; i++)
{ {
sum0 += data[0][i]; sum0 += data[0][i];
@ -183,7 +182,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
int posX = dst_startX - dst_x_off + col - anX; int posX = dst_startX - dst_x_off + col - anX;
int posY = (gly << 1); int posY = (gly << 1);
int till = (ksX + 1)%2; int till = (ksX + 1)%2;
float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
for (int k=0; k<6; k++) for (int k=0; k<6; k++)
for (int i=-anX; i<=anX - till; i++) for (int i=-anX; i<=anX - till; i++)
tmp_sum[k] += temp[k][col+i]; tmp_sum[k] += temp[k][col+i];

@ -98,7 +98,6 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
int col = get_local_id(0); int col = get_local_id(0);
int gX = get_group_id(0); int gX = get_group_id(0);
int gY = get_group_id(1); int gY = get_group_id(1);
int glx = get_global_id(0);
int gly = get_global_id(1); int gly = get_global_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2; int dx_x_off = (dx_offset % dx_step) >> 2;
@ -125,10 +124,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
{ {
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0; dx_data[i] = dx_con ? dx_s : 0.0f;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0; dy_data[i] = dy_con ? dy_s : 0.0f;
data[0][i] = dx_data[i] * dx_data[i]; data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i]; data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i];
@ -152,7 +151,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
data[2][i] = dy_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i];
} }
#endif #endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
for (int i=1; i < ksY; i++) for (int i=1; i < ksY; i++)
{ {
sum0 += (data[0][i]); sum0 += (data[0][i]);
@ -180,7 +179,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
int posX = dst_startX - dst_x_off + col - anX; int posX = dst_startX - dst_x_off + col - anX;
int posY = (gly << 1); int posY = (gly << 1);
int till = (ksX + 1)%2; int till = (ksX + 1)%2;
float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; float tmp_sum[6] = { 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f };
for (int k=0; k<6; k++) for (int k=0; k<6; k++)
for (int i=-anX; i<=anX - till; i++) for (int i=-anX; i<=anX - till; i++)
tmp_sum[k] += temp[k][col+i]; tmp_sum[k] += temp[k][col+i];

@ -43,9 +43,6 @@
// //
//M*/ //M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#ifdef L2GRAD #ifdef L2GRAD
inline float calc(int x, int y) inline float calc(int x, int y)
{ {
@ -248,7 +245,12 @@ void calcMagnitude
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
// 0.4142135623730950488016887242097 is tan(22.5) // 0.4142135623730950488016887242097 is tan(22.5)
#define CANNY_SHIFT 15 #define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
#ifdef DOUBLE_SUPPORT
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
#else
#define TG22 (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
#endif
//First pass of edge detection and non-maximum suppression //First pass of edge detection and non-maximum suppression
// edgetype is set to for each pixel: // edgetype is set to for each pixel:
@ -681,7 +683,7 @@ edgesHysteresisGlobal
ind = s_ind; ind = s_ind;
for (int i = lidx; i < s_counter; i += get_local_size(0)) for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
{ {
st2[ind + i] = s_st[i]; st2[ind + i] = s_st[i];
} }

@ -47,7 +47,7 @@
#define WAVE_SIZE 1 #define WAVE_SIZE 1
#endif #endif
int calc_lut(__local int* smem, int val, int tid) static int calc_lut(__local int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -61,7 +61,7 @@ int calc_lut(__local int* smem, int val, int tid)
} }
#ifdef CPU #ifdef CPU
void reduce(volatile __local int* smem, int val, int tid) static void reduce(volatile __local int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -101,7 +101,7 @@ void reduce(volatile __local int* smem, int val, int tid)
#else #else
void reduce(__local volatile int* smem, int val, int tid) static void reduce(__local volatile int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -147,9 +147,9 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
{ {
__local int smem[512]; __local int smem[512];
const int tx = get_group_id(0); int tx = get_group_id(0);
const int ty = get_group_id(1); int ty = get_group_id(1);
const unsigned int tid = get_local_id(1) * get_local_size(0) int tid = get_local_id(1) * get_local_size(0)
+ get_local_id(0); + get_local_id(0);
smem[tid] = 0; smem[tid] = 0;

@ -63,8 +63,8 @@
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
int4 src_t[2], sum_t[2]; int4 src_t[2], sum_t[2];
float4 sqsum_t[2]; float4 sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
@ -75,8 +75,8 @@ kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global fl
gid = gid << 1; gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
@ -163,8 +163,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__
__global float *sqsum,int rows,int cols,int src_step,int sum_step, __global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset) int sqsum_step,int sum_offset,int sqsum_offset)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
int4 src_t[2], sum_t[2]; int4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2]; float4 sqsrc_t[2],sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
@ -279,8 +279,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
float4 src_t[2], sum_t[2]; float4 src_t[2], sum_t[2];
float4 sqsum_t[2]; float4 sqsum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
@ -291,8 +291,8 @@ kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global
gid = gid << 1; gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0);
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
@ -379,8 +379,8 @@ kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,
__global float *sqsum,int rows,int cols,int src_step,int sum_step, __global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset) int sqsum_step,int sum_offset,int sqsum_offset)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
float4 src_t[2], sum_t[2]; float4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2]; float4 sqsrc_t[2],sqsum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE];

@ -64,8 +64,8 @@
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
int4 src_t[2], sum_t[2]; int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int* sum_p; __local int* sum_p;
@ -146,8 +146,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step, int rows,int cols,int src_step,int sum_step,
int sum_offset) int sum_offset)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
int4 src_t[2], sum_t[2]; int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int *sum_p; __local int *sum_p;
@ -239,8 +239,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
float4 src_t[2], sum_t[2]; float4 src_t[2], sum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float* sum_p; __local float* sum_p;
@ -321,8 +321,8 @@ kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
int rows,int cols,int src_step,int sum_step, int rows,int cols,int src_step,int sum_step,
int sum_offset) int sum_offset)
{ {
unsigned int lid = get_local_id(0); int lid = get_local_id(0);
unsigned int gid = get_group_id(0); int gid = get_group_id(0);
float4 src_t[2], sum_t[2]; float4 src_t[2], sum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float *sum_p; __local float *sum_p;

@ -106,10 +106,10 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2); op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
@ -148,10 +148,10 @@ __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, i
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2); op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
@ -190,10 +190,10 @@ __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, i
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2); op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
@ -232,10 +232,10 @@ __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2); op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
@ -294,10 +294,10 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12); op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
@ -356,10 +356,10 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12); op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
@ -418,10 +418,10 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12); op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
} }
#undef op(a,b) #undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} #define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols, __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
@ -480,7 +480,7 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12); op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols) if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
} }
#undef op(a,b) #undef op

@ -60,7 +60,7 @@
#elif defined BORDER_REPLICATE #elif defined BORDER_REPLICATE
#define EXTRAPOLATE(v2, v) \ #define EXTRAPOLATE(v2, v) \
{ \ { \
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \ v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
} }
#elif defined BORDER_WRAP #elif defined BORDER_WRAP
@ -139,7 +139,9 @@ __kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
if (NEED_EXTRAPOLATION(gx, gy)) if (NEED_EXTRAPOLATION(gx, gy))
{ {
int2 gxy = (int2)(gx, gy), zero = (int2)(0); #ifndef BORDER_CONSTANT
int2 gxy = (int2)(gx, gy);
#endif
EXTRAPOLATE(gxy, dst[dstIdx]); EXTRAPOLATE(gxy, dst[dstIdx]);
} }
else else
@ -167,10 +169,7 @@ __kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __g
int gx = gxy.x, gy = gxy.y; int gx = gxy.x, gy = gxy.y;
if (NEED_EXTRAPOLATION(gx, gy)) if (NEED_EXTRAPOLATION(gx, gy))
{ EXTRAPOLATE(gxy, dst[dstIdx])
int2 zero = (int2)(0);
EXTRAPOLATE(gxy, dst[dstIdx]);
}
else else
{ {
int srcIdx = mad24(gy, src_step, gx + src_offset); int srcIdx = mad24(gy, src_step, gx + src_offset);
@ -196,10 +195,7 @@ __kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __g
int gx = gxy.x, gy = gxy.y; int gx = gxy.x, gy = gxy.y;
if (NEED_EXTRAPOLATION(gx, gy)) if (NEED_EXTRAPOLATION(gx, gy))
{ EXTRAPOLATE(gxy, dst[dstIdx])
int2 zero = (int2)(0);
EXTRAPOLATE(gxy, dst[dstIdx]);
}
else else
{ {
int srcIdx = mad24(gy, src_step, gx + src_offset); int srcIdx = mad24(gy, src_step, gx + src_offset);
@ -231,7 +227,6 @@ __kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst,
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
int2 zero = (int2)(0);
float2 _u = map_data - convert_float2(map_dataA); float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
@ -285,7 +280,6 @@ __kernel void remap_32FC2(__global T const * restrict src, __global T * dst,
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
int2 zero = (int2)(0);
float2 _u = map_data - convert_float2(map_dataA); float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;

@ -182,10 +182,10 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int x = floor(sx), y = floor(sy); int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y; float u = sx - x, v = sy - y;
x<0 ? x=0,u=0 : x,u; if ( x<0 ) x=0,u=0;
x>=src_cols ? x=src_cols-1,u=0 : x,u; if ( x>=src_cols ) x=src_cols-1,u=0;
y<0 ? y=0,v=0 : y,v; if ( y<0 ) y=0,v=0;
y>=src_rows ? y=src_rows-1,v=0 : y,v; if (y>=src_rows ) y=src_rows-1,v=0;
u = u * INTER_RESIZE_COEF_SCALE; u = u * INTER_RESIZE_COEF_SCALE;
v = v * INTER_RESIZE_COEF_SCALE; v = v * INTER_RESIZE_COEF_SCALE;
@ -225,10 +225,10 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int x = floor(sx), y = floor(sy); int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y; float u = sx - x, v = sy - y;
x<0 ? x=0,u=0 : x,u; if ( x<0 ) x=0,u=0;
x>=src_cols ? x=src_cols-1,u=0 : x,u; if ( x>=src_cols ) x=src_cols-1,u=0;
y<0 ? y=0,v=0 : y,v; if ( y<0 ) y=0,v=0;
y>=src_rows ? y=src_rows-1,v=0 : y,v; if (y>=src_rows ) y=src_rows-1,v=0;
int y_ = INC(y,src_rows); int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols); int x_ = INC(x,src_cols);
@ -264,10 +264,10 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int x = floor(sx), y = floor(sy); int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y; float u = sx - x, v = sy - y;
x<0 ? x=0,u=0 : x; if ( x<0 ) x=0,u=0;
x>=src_cols ? x=src_cols-1,u=0 : x; if ( x>=src_cols ) x=src_cols-1,u=0;
y<0 ? y=0,v=0 : y; if ( y<0 ) y=0,v=0;
y>=src_rows ? y=src_rows-1,v=0 : y; if (y>=src_rows ) y=src_rows-1,v=0;
int y_ = INC(y,src_rows); int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols); int x_ = INC(x,src_cols);

@ -71,18 +71,18 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src
#else #else
VT sdata = VLOADN(0, src + src_index); VT sdata = VLOADN(0, src + src_index);
#endif #endif
VT vthresh = (VT)(thresh), zero = (VT)(0); VT vthresh = (VT)(thresh);
#ifdef THRESH_BINARY #ifdef THRESH_BINARY
VT vecValue = sdata > vthresh ? max_val : zero; VT vecValue = sdata > vthresh ? max_val : (VT)(0);
#elif defined THRESH_BINARY_INV #elif defined THRESH_BINARY_INV
VT vecValue = sdata > vthresh ? zero : max_val; VT vecValue = sdata > vthresh ? (VT)(0) : max_val;
#elif defined THRESH_TRUNC #elif defined THRESH_TRUNC
VT vecValue = sdata > vthresh ? thresh : sdata; VT vecValue = sdata > vthresh ? thresh : sdata;
#elif defined THRESH_TOZERO #elif defined THRESH_TOZERO
VT vecValue = sdata > vthresh ? sdata : zero; VT vecValue = sdata > vthresh ? sdata : (VT)(0);
#elif defined THRESH_TOZERO_INV #elif defined THRESH_TOZERO_INV
VT vecValue = sdata > vthresh ? zero : sdata; VT vecValue = sdata > vthresh ? (VT)(0) : sdata;
#endif #endif
if (gx + VECSIZE <= max_index) if (gx + VECSIZE <= max_index)
@ -117,18 +117,18 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src
int src_index = mad24(gy, src_step, src_offset + gx); int src_index = mad24(gy, src_step, src_offset + gx);
int dst_index = mad24(gy, dst_step, dst_offset + gx); int dst_index = mad24(gy, dst_step, dst_offset + gx);
T sdata = src[src_index], zero = (T)(0); T sdata = src[src_index];
#ifdef THRESH_BINARY #ifdef THRESH_BINARY
dst[dst_index] = sdata > thresh ? max_val : zero; dst[dst_index] = sdata > thresh ? max_val : (T)(0);
#elif defined THRESH_BINARY_INV #elif defined THRESH_BINARY_INV
dst[dst_index] = sdata > thresh ? zero : max_val; dst[dst_index] = sdata > thresh ? (T)(0) : max_val;
#elif defined THRESH_TRUNC #elif defined THRESH_TRUNC
dst[dst_index] = sdata > thresh ? thresh : sdata; dst[dst_index] = sdata > thresh ? thresh : sdata;
#elif defined THRESH_TOZERO #elif defined THRESH_TOZERO
dst[dst_index] = sdata > thresh ? sdata : zero; dst[dst_index] = sdata > thresh ? sdata : (T)(0);
#elif defined THRESH_TOZERO_INV #elif defined THRESH_TOZERO_INV
dst[dst_index] = sdata > thresh ? zero : sdata; dst[dst_index] = sdata > thresh ? (T)(0) : sdata;
#endif #endif
} }
} }

@ -537,9 +537,9 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0]; tab[0] = taby[0] * tabx[0];
@ -680,9 +680,9 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0]; tab[0] = taby[0] * tabx[0];

@ -133,7 +133,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -150,9 +150,9 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
short itab[4]; short itab[4];
float tab1y[2], tab1x[2]; float tab1y[2], tab1x[2];
tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
tab1y[1] = 1.f/INTER_TAB_SIZE*ay; tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
tab1x[1] = 1.f/INTER_TAB_SIZE*ax; tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
#pragma unroll 4 #pragma unroll 4
@ -185,7 +185,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -265,7 +265,7 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? 1./W : 0.0; W = (W != 0.0f) ? 1.f/W : 0.0f;
short sx = convert_short_sat_rte(X0*W); short sx = convert_short_sat_rte(X0*W);
short sy = convert_short_sat_rte(Y0*W); short sy = convert_short_sat_rte(Y0*W);
@ -289,7 +289,7 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -341,7 +341,7 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -424,7 +424,7 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? 1./W : 0.0; W = (W != 0.0f) ? 1.f/W : 0.0f;
short sx = convert_short_sat_rte(X0*W); short sx = convert_short_sat_rte(X0*W);
short sy = convert_short_sat_rte(Y0*W); short sy = convert_short_sat_rte(Y0*W);
@ -447,7 +447,7 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -465,9 +465,9 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
taby[1] = 1.f/INTER_TAB_SIZE*ay; taby[1] = 1.f/INTER_TAB_SIZE*ay;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
tabx[1] = 1.f/INTER_TAB_SIZE*ax; tabx[1] = 1.f/INTER_TAB_SIZE*ax;
tab[0] = taby[0] * tabx[0]; tab[0] = taby[0] * tabx[0];
@ -497,7 +497,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -557,7 +557,7 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W =(W != 0.0)? 1./W : 0.0; W =(W != 0.0f)? 1.f/W : 0.0f;
short sx = convert_short_sat_rte(X0*W); short sx = convert_short_sat_rte(X0*W);
short sy = convert_short_sat_rte(Y0*W); short sy = convert_short_sat_rte(Y0*W);
@ -583,7 +583,7 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);
@ -602,9 +602,9 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0]; tab[0] = taby[0] * tabx[0];
@ -636,7 +636,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
F X0 = M[0]*dx + M[1]*dy + M[2]; F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5]; F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8]; F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
int X = rint(X0*W); int X = rint(X0*W);
int Y = rint(Y0*W); int Y = rint(Y0*W);

@ -192,7 +192,6 @@ __kernel
{ {
const int i = get_local_id(0); // index in workgroup const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0; int pos = 0, same = 0;
const int offset = get_group_id(0) * wg; const int offset = get_group_id(0) * wg;

@ -63,7 +63,7 @@
///////////// parallel merge sort /////////////// ///////////// parallel merge sort ///////////////
// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) static uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
{ {
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
uint firstIndex = left; uint firstIndex = left;
@ -94,7 +94,7 @@ uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
// by a base pointer and left and right index for a particular candidate value. The comparison operator is // by a base pointer and left and right index for a particular candidate value. The comparison operator is
// passed as a functor parameter my_comp // passed as a functor parameter my_comp
// This function returns an index that is the first index whos value would be equal to the searched value // This function returns an index that is the first index whos value would be equal to the searched value
uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) static uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
{ {
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
uint firstIndex = left; uint firstIndex = left;
@ -130,7 +130,7 @@ uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
// passed as a functor parameter my_comp // passed as a functor parameter my_comp
// This function returns an index that is the first index whos value would be greater than the searched value // This function returns an index that is the first index whos value would be greater than the searched value
// If the search value is not found in the sequence, upperbound returns the same result as lowerbound // If the search value is not found in the sequence, upperbound returns the same result as lowerbound
uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) static uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
{ {
uint upperBound = lowerBoundBinary( data, left, right, searchVal ); uint upperBound = lowerBoundBinary( data, left, right, searchVal );
@ -167,9 +167,6 @@ kernel void merge(
) )
{ {
size_t globalID = get_global_id( 0 ); size_t globalID = get_global_id( 0 );
size_t groupID = get_group_id( 0 );
size_t localID = get_local_id( 0 );
size_t wgSize = get_local_size( 0 );
// Abort threads that are passed the end of the input vector // Abort threads that are passed the end of the input vector
if( globalID >= srcVecSize ) if( globalID >= srcVecSize )
@ -230,12 +227,12 @@ kernel void blockInsertionSort(
local V_T* val_lds local V_T* val_lds
) )
{ {
size_t gloId = get_global_id( 0 ); int gloId = get_global_id( 0 );
size_t groId = get_group_id( 0 ); int groId = get_group_id( 0 );
size_t locId = get_local_id( 0 ); int locId = get_local_id( 0 );
size_t wgSize = get_local_size( 0 ); int wgSize = get_local_size( 0 );
bool in_range = gloId < vecSize; bool in_range = gloId < (int)vecSize;
K_T key; K_T key;
V_T val; V_T val;
// Abort threads that are passed the end of the input vector // Abort threads that are passed the end of the input vector
@ -254,7 +251,7 @@ kernel void blockInsertionSort(
{ {
// The last workgroup may have an irregular size, so we calculate a per-block endIndex // The last workgroup may have an irregular size, so we calculate a per-block endIndex
// endIndex is essentially emulating a mod operator with subtraction and multiply // endIndex is essentially emulating a mod operator with subtraction and multiply
size_t endIndex = vecSize - ( groId * wgSize ); int endIndex = vecSize - ( groId * wgSize );
endIndex = min( endIndex, wgSize ); endIndex = min( endIndex, wgSize );
// printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );

@ -129,58 +129,53 @@ __kernel void knn_find_nearest(__global float* sample, int sample_row, int sampl
} }
/*! find_nearest_neighbor done!*/ /*! find_nearest_neighbor done!*/
/*! write_results start!*/ /*! write_results start!*/
switch (regression) if (regression)
{ {
case true: TYPE s;
{
TYPE s;
#ifdef DOUBLE_SUPPORT #ifdef DOUBLE_SUPPORT
s = 0.0; s = 0.0;
#else #else
s = 0.0f; s = 0.0f;
#endif #endif
for(j = 0; j < K1; j++) for(j = 0; j < K1; j++)
s += nr[j * nThreads + threadY]; s += nr[j * nThreads + threadY];
_results[y * _results_step] = (float)(s * inv_scale); _results[y * _results_step] = (float)(s * inv_scale);
} }
break; else
case false: {
{ int prev_start = 0, best_count = 0, cur_count;
int prev_start = 0, best_count = 0, cur_count; float best_val;
float best_val;
for(j = K1 - 1; j > 0; j--) for(j = K1 - 1; j > 0; j--)
{
bool swap_f1 = false;
for(j1 = 0; j1 < j; j1++)
{ {
bool swap_f1 = false; if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
for(j1 = 0; j1 < j; j1++)
{ {
if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY]) int t;
{ CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
int t; swap_f1 = true;
CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
swap_f1 = true;
}
} }
if(!swap_f1)
break;
} }
if(!swap_f1)
break;
}
best_val = 0; best_val = 0;
for(j = 1; j <= K1; j++) for(j = 1; j <= K1; j++)
if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY]) if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
{
cur_count = j - prev_start;
if(best_count < cur_count)
{ {
cur_count = j - prev_start; best_count = cur_count;
if(best_count < cur_count) best_val = nr[(j - 1) * nThreads + threadY];
{
best_count = cur_count;
best_val = nr[(j - 1) * nThreads + threadY];
}
prev_start = j;
} }
_results[y * _results_step] = best_val; prev_start = j;
} }
break; _results[y * _results_step] = best_val;
} }
///*! write_results done!*/ ///*! write_results done!*/
} }

@ -43,8 +43,6 @@
// //
//M*/ //M*/
#pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64 #ifdef cl_khr_fp64
@ -70,7 +68,7 @@
#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
// normAcc* are accurate normalization routines which make GPU matchTemplate // normAcc* are accurate normalization routines which make GPU matchTemplate
// consistent with CPU one // consistent with CPU one
float normAcc(float num, float denum) inline float normAcc(float num, float denum)
{ {
if(fabs(num) < denum) if(fabs(num) < denum)
{ {
@ -83,7 +81,7 @@ float normAcc(float num, float denum)
return 0; return 0;
} }
float normAcc_SQDIFF(float num, float denum) inline float normAcc_SQDIFF(float num, float denum)
{ {
if(fabs(num) < denum) if(fabs(num) < denum)
{ {

@ -46,7 +46,7 @@
// //
//M*/ //M*/
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, static short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off, __global uchar4* in, int in_step, int dst_off, int src_off,
int cols, int rows, int sp, int sr, int maxIter, float eps) int cols, int rows, int sp, int sr, int maxIter, float eps)
{ {
@ -56,7 +56,6 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
src_off = src_off >> 2; src_off = src_off >> 2;
dst_off = dst_off >> 2; dst_off = dst_off >> 2;
int idx = src_off + y0 * in_step + x0; int idx = src_off + y0 * in_step + x0;
// uchar4 c = vload4(0, (__global uchar*)in+idx);
uchar4 c = in[idx]; uchar4 c = in[idx];
int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ; int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ;

@ -162,7 +162,6 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
WT4 x3 = (WT4)(0.f); WT4 x3 = (WT4)(0.f);
__global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256; __global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256;
bool switchFlag = false;
WT4 p; WT4 p;
WT4 x; WT4 x;
@ -173,7 +172,7 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
if(dy < src_rows) if(dy < src_rows)
{ {
if((x_rest > 0) && (gidx == (get_num_groups(0) - 1))) if((x_rest > 0) && (gidx == ((int)get_num_groups(0) - 1)))
{ {
int i; int i;
for(i = 0; i < x_rest - 4; i += 4) for(i = 0; i < x_rest - 4; i += 4)
@ -190,11 +189,8 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
} }
x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
WT x0_ = 0; WT x0_ = 0;
@ -238,11 +234,8 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
} }
x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
} }
@ -251,7 +244,7 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
} }
__local WT mom[10][256]; __local WT mom[10][256];
if((y_rest > 0) && (gidy == (get_num_groups(1) - 1))) if((y_rest > 0) && (gidy == ((int)get_num_groups(1) - 1)))
{ {
if(ly < y_rest) if(ly < y_rest)
{ {
@ -268,13 +261,10 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(ly < 10) if(ly < 10)
{
for(int i = 1; i < y_rest; i++) for(int i = 1; i < y_rest; i++)
{
mom[ly][0] = mom[ly][i] + mom[ly][0]; mom[ly][0] = mom[ly][i] + mom[ly][0];
} }
} else
}else
{ {
mom[9][ly] = py * sy; mom[9][ly] = py * sy;
mom[8][ly] = x1.s0 * sy; mom[8][ly] = x1.s0 * sy;
@ -413,11 +403,9 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
if(binary) if(binary)
{ {
WT s = 1./255; WT s = 1.0f/255;
if(ly < 10) if(ly < 10)
{
mom[ly][0] *= s; mom[ly][0] *= s;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
WT xm = (gidx * 256) * mom[0][0]; WT xm = (gidx * 256) * mom[0][0];
@ -440,7 +428,5 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(ly < 10) if(ly < 10)
{
dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1]; dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1];
}
} }

@ -200,7 +200,7 @@ __kernel void normalize_hists_36_kernel(__global float* block_hists,
//------------------------------------------------------------- //-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm // Normalization of histograms via L2Hys_norm
// //
float reduce_smem(volatile __local float* smem, int size) static float reduce_smem(volatile __local float* smem, int size)
{ {
unsigned int tid = get_local_id(0); unsigned int tid = get_local_id(0);
float sum = smem[tid]; float sum = smem[tid];
@ -564,7 +564,6 @@ __kernel void compute_gradients_8UC4_kernel(
const int x = get_global_id(0); const int x = get_global_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
const int gSizeX = get_local_size(0); const int gSizeX = get_local_size(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1); const int gidY = get_group_id(1);
__global const uchar4* row = img + gidY * img_step; __global const uchar4* row = img + gidY * img_step;
@ -667,7 +666,6 @@ __kernel void compute_gradients_8UC1_kernel(
const int x = get_global_id(0); const int x = get_global_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
const int gSizeX = get_local_size(0); const int gSizeX = get_local_size(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1); const int gidY = get_group_id(1);
__global const uchar* row = img + gidY * img_step; __global const uchar* row = img + gidY * img_step;

@ -44,10 +44,10 @@
//M*/ //M*/
#define tx get_local_id(0) #define tx (int)get_local_id(0)
#define ty get_local_id(1) #define ty get_local_id(1)
#define bx get_group_id(0) #define bx get_group_id(0)
#define bdx get_local_size(0) #define bdx (int)get_local_size(0)
#define BORDER_SIZE 5 #define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100 #define MAX_KSIZE_HALF 100

@ -43,32 +43,32 @@
// //
//M*/ //M*/
int idx_row_low(int y, int last_row) inline int idx_row_low(int y, int last_row)
{ {
return abs(y) % (last_row + 1); return abs(y) % (last_row + 1);
} }
int idx_row_high(int y, int last_row) inline int idx_row_high(int y, int last_row)
{ {
return abs(last_row - (int)abs(last_row - y)) % (last_row + 1); return abs(last_row - (int)abs(last_row - y)) % (last_row + 1);
} }
int idx_row(int y, int last_row) inline int idx_row(int y, int last_row)
{ {
return idx_row_low(idx_row_high(y, last_row), last_row); return idx_row_low(idx_row_high(y, last_row), last_row);
} }
int idx_col_low(int x, int last_col) inline int idx_col_low(int x, int last_col)
{ {
return abs(x) % (last_col + 1); return abs(x) % (last_col + 1);
} }
int idx_col_high(int x, int last_col) inline int idx_col_high(int x, int last_col)
{ {
return abs(last_col - (int)abs(last_col - x)) % (last_col + 1); return abs(last_col - (int)abs(last_col - x)) % (last_col + 1);
} }
int idx_col(int x, int last_col) inline int idx_col(int x, int last_col)
{ {
return idx_col_low(idx_col_high(x, last_col), last_col); return idx_col_low(idx_col_high(x, last_col), last_col);
} }

@ -53,7 +53,8 @@
#define WAVE_SIZE 1 #define WAVE_SIZE 1
#endif #endif
#ifdef CPU #ifdef CPU
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
static void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
smem2[tid] = val2; smem2[tid] = val2;
@ -72,7 +73,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local
} }
} }
void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) static void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
smem2[tid] = val2; smem2[tid] = val2;
@ -89,7 +90,7 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
} }
} }
void reduce1(float val1, volatile __local float* smem1, int tid) static void reduce1(float val1, volatile __local float* smem1, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -104,7 +105,7 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
} }
} }
#else #else
void reduce3(float val1, float val2, float val3, static void reduce3(float val1, float val2, float val3,
__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
@ -151,7 +152,7 @@ void reduce3(float val1, float val2, float val3,
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) static void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
smem2[tid] = val2; smem2[tid] = val2;
@ -190,7 +191,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
void reduce1(float val1, __local volatile float* smem1, int tid) static void reduce1(float val1, __local volatile float* smem1, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -226,7 +227,7 @@ void reduce1(float val1, __local volatile float* smem1, int tid)
// Image read mode // Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
void SetPatch(image2d_t I, float x, float y, static void SetPatch(image2d_t I, float x, float y,
float* Pch, float* Dx, float* Dy, float* Pch, float* Dx, float* Dy,
float* A11, float* A12, float* A22) float* A11, float* A12, float* A22)
{ {
@ -247,7 +248,7 @@ void SetPatch(image2d_t I, float x, float y,
*A22 += dIdy * dIdy; *A22 += dIdy * dIdy;
} }
void GetPatch(image2d_t J, float x, float y, inline void GetPatch(image2d_t J, float x, float y,
float* Pch, float* Dx, float* Dy, float* Pch, float* Dx, float* Dy,
float* b1, float* b2) float* b1, float* b2)
{ {
@ -257,13 +258,13 @@ void GetPatch(image2d_t J, float x, float y,
*b2 += diff**Dy; *b2 += diff**Dy;
} }
void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
{ {
float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
*errval += fabs(diff); *errval += fabs(diff);
} }
void SetPatch4(image2d_t I, const float x, const float y, static void SetPatch4(image2d_t I, const float x, const float y,
float4* Pch, float4* Dx, float4* Dy, float4* Pch, float4* Dx, float4* Dy,
float* A11, float* A12, float* A22) float* A11, float* A12, float* A22)
{ {
@ -286,7 +287,7 @@ void SetPatch4(image2d_t I, const float x, const float y,
*A22 += sqIdx.x + sqIdx.y + sqIdx.z; *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
} }
void GetPatch4(image2d_t J, const float x, const float y, static void GetPatch4(image2d_t J, const float x, const float y,
const float4* Pch, const float4* Dx, const float4* Dy, const float4* Pch, const float4* Dx, const float4* Dy,
float* b1, float* b2) float* b1, float* b2)
{ {
@ -298,7 +299,7 @@ void GetPatch4(image2d_t J, const float x, const float y,
*b2 += xdiff.x + xdiff.y + xdiff.z; *b2 += xdiff.x + xdiff.y + xdiff.z;
} }
void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) static void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
{ {
float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
@ -318,7 +319,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
unsigned int gid=get_group_id(0); unsigned int gid=get_group_id(0);
unsigned int xsize=get_local_size(0); unsigned int xsize=get_local_size(0);
unsigned int ysize=get_local_size(1); unsigned int ysize=get_local_size(1);
int xBase, yBase, i, j, k; int xBase, yBase, k;
float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
@ -597,7 +598,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
unsigned int gid=get_group_id(0); unsigned int gid=get_group_id(0);
unsigned int xsize=get_local_size(0); unsigned int xsize=get_local_size(0);
unsigned int ysize=get_local_size(1); unsigned int ysize=get_local_size(1);
int xBase, yBase, i, j, k; int xBase, yBase, k;
float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);

@ -183,7 +183,7 @@ __kernel void split_vector(
int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \ int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \
int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \ int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \
int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \ int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \
if (!BYPASS_VSTORE && dst ## xOffsetBytes + sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \ if (!BYPASS_VSTORE && dst ## xOffsetBytes + (int)sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \
{ \ { \
VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \ VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \
} \ } \
@ -192,7 +192,7 @@ __kernel void split_vector(
VEC_TO_ARRAY(vecValue, vecValue##Array); \ VEC_TO_ARRAY(vecValue, vecValue##Array); \
for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \ for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \
{ \ { \
if (dst ## xOffsetBytes + sizeof(TYPE) <= dst ## xOffsetLimitBytes) \ if (dst ## xOffsetBytes + (int)sizeof(TYPE) <= dst ## xOffsetLimitBytes) \
*(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \ *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \
else \ else \
break; \ break; \

@ -56,7 +56,7 @@
#define radius 64 #define radius 64
#endif #endif
unsigned int CalcSSD(__local unsigned int *col_ssd) static unsigned int CalcSSD(__local unsigned int *col_ssd)
{ {
unsigned int cache = col_ssd[0]; unsigned int cache = col_ssd[0];
@ -67,7 +67,7 @@ unsigned int CalcSSD(__local unsigned int *col_ssd)
return cache; return cache;
} }
uint2 MinSSD(__local unsigned int *col_ssd) static uint2 MinSSD(__local unsigned int *col_ssd)
{ {
unsigned int ssd[N_DISPARITIES]; unsigned int ssd[N_DISPARITIES];
const int win_size = (radius << 1); const int win_size = (radius << 1);
@ -95,7 +95,7 @@ uint2 MinSSD(__local unsigned int *col_ssd)
return (uint2)(mssd, bestIdx); return (uint2)(mssd, bestIdx);
} }
void StepDown(int idx1, int idx2, __global unsigned char* imageL, static void StepDown(int idx1, int idx2, __global unsigned char* imageL,
__global unsigned char* imageR, int d, __local unsigned int *col_ssd) __global unsigned char* imageR, int d, __local unsigned int *col_ssd)
{ {
uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7))); uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
@ -114,7 +114,7 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL,
col_ssd[7 * (BLOCK_W + win_size)] += res.s0; col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
} }
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, static void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
__global unsigned char* imageR, int d, __global unsigned char* imageR, int d,
__local unsigned int *col_ssd) __local unsigned int *col_ssd)
{ {
@ -153,7 +153,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
#define Y (get_group_id(1) * ROWSperTHREAD + radius) #define Y (int)(get_group_id(1) * ROWSperTHREAD + radius)
__global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned char* disparImage = disp + X + Y * disp_step; __global unsigned char* disparImage = disp + X + Y * disp_step;
@ -241,7 +241,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned
/////////////////////////////////// Textureness filtering //////////////////////////////////////// /////////////////////////////////// Textureness filtering ////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
float sobel(__global unsigned char *input, int x, int y, int rows, int cols) static float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
{ {
float conv = 0; float conv = 0;
int y1 = y==0? 0 : y-1; int y1 = y==0? 0 : y-1;
@ -256,7 +256,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
return fabs(conv); return fabs(conv);
} }
float CalcSums(__local float *cols, __local float *cols_cache, int winsz) static float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
{ {
unsigned int cache = cols[0]; unsigned int cache = cols[0];

@ -65,7 +65,7 @@
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////common/////////////////////////////////////// /////////////////common///////////////////////////////////////
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
T saturate_cast(float v){ inline T saturate_cast(float v){
#ifdef T_SHORT #ifdef T_SHORT
return convert_short_sat_rte(v); return convert_short_sat_rte(v);
#else #else
@ -73,7 +73,7 @@ T saturate_cast(float v){
#endif #endif
} }
T4 saturate_cast4(float4 v){ inline T4 saturate_cast4(float4 v){
#ifdef T_SHORT #ifdef T_SHORT
return convert_short4_sat_rte(v); return convert_short4_sat_rte(v);
#else #else
@ -99,7 +99,7 @@ inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
return abs((int)(l.x) - *rs); return abs((int)(l.x) - *rs);
} }
float pix_diff_4(const uchar4 l, __global const uchar *rs) static float pix_diff_4(const uchar4 l, __global const uchar *rs)
{ {
uchar4 r; uchar4 r;
r = *((__global uchar4 *)rs); r = *((__global uchar4 *)rs);
@ -235,7 +235,7 @@ __kernel void level_up_message(__global T *src, int src_rows, int src_step,
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_, static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
const __global T *dt, const __global T *dt,
int u_step, int msg_disp_step, int data_disp_step, int u_step, int msg_disp_step, int data_disp_step,
float4 cmax_disc_term, float4 cdisc_single_jump) float4 cmax_disc_term, float4 cdisc_single_jump)

@ -248,7 +248,7 @@ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, _
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// init data cost //////////////////////// /////////////////////// init data cost ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
float compute_3(__global uchar* left, __global uchar* right, inline float compute_3(__global uchar* left, __global uchar* right,
float cdata_weight, float cmax_data_term) float cdata_weight, float cmax_data_term)
{ {
float tb = 0.114f * abs((int)left[0] - right[0]); float tb = 0.114f * abs((int)left[0] - right[0]);
@ -257,17 +257,21 @@ float compute_3(__global uchar* left, __global uchar* right,
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
} }
float compute_1(__global uchar* left, __global uchar* right, inline float compute_1(__global uchar* left, __global uchar* right,
float cdata_weight, float cmax_data_term) float cdata_weight, float cmax_data_term)
{ {
return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term); return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term);
} }
short round_short(float v){
inline short round_short(float v)
{
return convert_short_sat_rte(v); return convert_short_sat_rte(v);
} }
/////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////init_data_cost/////////////////////////////////////////////// ///////////////////////////////////init_data_cost///////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////
__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, __kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
int h, int w, int level, int channels, int h, int w, int level, int channels,
int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
@ -993,7 +997,8 @@ __kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////////// init message ///////////////////////// //////////////////////// init message /////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
static void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
__global short *r_new, __global const short *u_cur, __global const short *d_cur, __global short *r_new, __global const short *u_cur, __global const short *d_cur,
__global const short *l_cur, __global const short *r_cur, __global const short *l_cur, __global const short *r_cur,
__global short *data_cost_selected, __global short *disparity_selected_new, __global short *data_cost_selected, __global short *disparity_selected_new,
@ -1027,7 +1032,8 @@ void get_first_k_element_increase_0(__global short* u_new, __global short *d_new
data_cost_new[id * cdisp_step1] = SHRT_MAX; data_cost_new[id * cdisp_step1] = SHRT_MAX;
} }
} }
void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new,
static void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new,
__global float *r_new, __global const float *u_cur, __global const float *d_cur, __global float *r_new, __global const float *u_cur, __global const float *d_cur,
__global const float *l_cur, __global const float *r_cur, __global const float *l_cur, __global const float *r_cur,
__global float *data_cost_selected, __global float *disparity_selected_new, __global float *data_cost_selected, __global float *disparity_selected_new,
@ -1190,7 +1196,8 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
static void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
__global const short *msg2, __global const short *msg3, __global const short *msg2, __global const short *msg3,
__global const short *dst_disp, __global const short *src_disp, __global const short *dst_disp, __global const short *src_disp,
int nr_plane, __global short *temp, int nr_plane, __global short *temp,
@ -1226,7 +1233,8 @@ void message_per_pixel_0(__global const short *data, __global short *msg_dst, __
for(int d = 0; d < nr_plane; d++) for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum); msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum);
} }
void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
static void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
__global const float *msg2, __global const float *msg3, __global const float *msg2, __global const float *msg3,
__global const float *dst_disp, __global const float *src_disp, __global const float *dst_disp, __global const float *src_disp,
int nr_plane, __global float *temp, int nr_plane, __global float *temp,
@ -1262,6 +1270,7 @@ void message_per_pixel_1(__global const float *data, __global float *msg_dst, __
for(int d = 0; d < nr_plane; d++) for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum; msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum;
} }
__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_, __kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_,
__global const short *data_cost_selected, __global const short *selected_disp_pyr_cur, __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur,
__global short *ctemp, int h, int w, int nr_plane, int i, __global short *ctemp, int h, int w, int nr_plane, int i,
@ -1293,6 +1302,7 @@ __kernel void compute_message_0(__global short *u_, __global short *d_, __global
cmax_disc_term, cdisp_step1, cdisc_single_jump); cmax_disc_term, cdisp_step1, cdisc_single_jump);
} }
} }
__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_, __kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_,
__global const float *data_cost_selected, __global const float *selected_disp_pyr_cur, __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur,
__global float *ctemp, int h, int w, int nr_plane, int i, __global float *ctemp, int h, int w, int nr_plane, int i,
@ -1327,6 +1337,7 @@ __kernel void compute_message_1(__global float *u_, __global float *d_, __global
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////////// output //////////////////////////// /////////////////////////// output ////////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_, __kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_,
__global const short *r_, __global const short * data_cost_selected, __global const short *r_, __global const short * data_cost_selected,
__global const short *disp_selected_pyr, __global const short *disp_selected_pyr,
@ -1364,6 +1375,7 @@ __kernel void compute_disp_0(__global const short *u_, __global const short *d_,
disp[res_step * y + x] = best; disp[res_step * y + x] = best;
} }
} }
__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_, __kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_,
__global const float *r_, __global const float *data_cost_selected, __global const float *r_, __global const float *data_cost_selected,
__global const float *disp_selected_pyr, __global const float *disp_selected_pyr,

Loading…
Cancel
Save