From 1f9ab2e0cac2d7bfbd2330d5a620eac4fd3e191e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 31 Oct 2013 23:23:56 +0400 Subject: [PATCH] fixed warnings in ocl kernels --- modules/ocl/src/opencl/arithm_bitwise_not.cl | 2 - modules/ocl/src/opencl/arithm_cartToPolar.cl | 22 +- modules/ocl/src/opencl/arithm_minMax.cl | 200 ++++++++--------- modules/ocl/src/opencl/arithm_minMaxLoc.cl | 206 +++++++++--------- .../ocl/src/opencl/arithm_minMaxLoc_mask.cl | 182 ++++++++-------- modules/ocl/src/opencl/arithm_nonzero.cl | 8 +- modules/ocl/src/opencl/arithm_phase.cl | 18 +- modules/ocl/src/opencl/arithm_polarToCart.cl | 13 +- modules/ocl/src/opencl/arithm_sum.cl | 58 ++--- modules/ocl/src/opencl/brute_force_match.cl | 18 +- modules/ocl/src/opencl/cvt_color.cl | 6 +- modules/ocl/src/opencl/haarobjectdetect.cl | 1 - .../src/opencl/haarobjectdetect_scaled2.cl | 1 - modules/ocl/src/opencl/imgproc_calcHarris.cl | 9 +- .../ocl/src/opencl/imgproc_calcMinEigenVal.cl | 9 +- modules/ocl/src/opencl/imgproc_canny.cl | 12 +- modules/ocl/src/opencl/imgproc_clahe.cl | 12 +- modules/ocl/src/opencl/imgproc_integral.cl | 24 +- .../ocl/src/opencl/imgproc_integral_sum.cl | 16 +- modules/ocl/src/opencl/imgproc_median.cl | 32 +-- modules/ocl/src/opencl/imgproc_remap.cl | 18 +- modules/ocl/src/opencl/imgproc_resize.cl | 24 +- modules/ocl/src/opencl/imgproc_threshold.cl | 20 +- modules/ocl/src/opencl/imgproc_warpAffine.cl | 8 +- .../ocl/src/opencl/imgproc_warpPerspective.cl | 34 +-- modules/ocl/src/opencl/kernel_sort_by_key.cl | 1 - .../src/opencl/kernel_stablesort_by_key.cl | 21 +- modules/ocl/src/opencl/knearest.cl | 73 +++---- modules/ocl/src/opencl/match_template.cl | 6 +- modules/ocl/src/opencl/meanShift.cl | 3 +- modules/ocl/src/opencl/moments.cl | 24 +- modules/ocl/src/opencl/objdetect_hog.cl | 4 +- .../ocl/src/opencl/optical_flow_farneback.cl | 4 +- modules/ocl/src/opencl/pyr_down.cl | 12 +- modules/ocl/src/opencl/pyrlk.cl | 29 +-- modules/ocl/src/opencl/split_mat.cl | 4 +- modules/ocl/src/opencl/stereobm.cl | 14 +- modules/ocl/src/opencl/stereobp.cl | 8 +- modules/ocl/src/opencl/stereocsbp.cl | 26 ++- 39 files changed, 578 insertions(+), 604 deletions(-) diff --git a/modules/ocl/src/opencl/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl index e5b46c9368..5bc1839d6a 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_not.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl @@ -67,7 +67,6 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; int src1_index = mad24(y, src1_step, x + src1_offset); - int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x); @@ -97,7 +96,6 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src x = x << 2; int src1_index = mad24(y, src1_step, x + src1_offset); - int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x); diff --git a/modules/ocl/src/opencl/arithm_cartToPolar.cl b/modules/ocl/src/opencl/arithm_cartToPolar.cl index 6c779ead90..e37818c40f 100644 --- a/modules/ocl/src/opencl/arithm_cartToPolar.cl +++ b/modules/ocl/src/opencl/arithm_cartToPolar.cl @@ -44,14 +44,18 @@ //M*/ #if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #define CV_PI 3.1415926535897932384626433832795 + #ifndef DBL_EPSILON + #define DBL_EPSILON 0x1.0p-52 + #endif +#else + #define CV_PI 3.1415926535897932384626433832795f + #ifndef DBL_EPSILON + #define DBL_EPSILON 0x1.0p-52f + #endif #endif -#define CV_PI 3.1415926535897932384626433832795 - -#ifndef DBL_EPSILON -#define DBL_EPSILON 0x1.0p-52 -#endif __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, @@ -82,9 +86,9 @@ __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int sr float tmp = y >= 0 ? 0 : CV_PI*2; tmp = x < 0 ? CV_PI : tmp; - float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5; - cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + (float)DBL_EPSILON) + tmp : - tmp1 - x*y/(y2 + 0.28f*x2 + (float)DBL_EPSILON); + float tmp1 = y >= 0 ? CV_PI*0.5f : CV_PI*1.5f; + cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + DBL_EPSILON) + tmp : + tmp1 - x*y/(y2 + 0.28f*x2 + DBL_EPSILON); cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI); diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl index 35f4cdd700..33a39d83f3 100644 --- a/modules/ocl/src/opencl/arithm_minMax.cl +++ b/modules/ocl/src/opencl/arithm_minMax.cl @@ -66,53 +66,53 @@ __kernel void arithm_op_minMax(__global const T * src, __global T * dst, int cols, int invalid_cols, int offset, int elemnum, int groupnum) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - - unsigned int idx = offset + id + (id / cols) * invalid_cols; - - __local T localmem_max[128], localmem_min[128]; - T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; - - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = src[idx]; - minval = min(minval, temp); - maxval = max(maxval, temp); - } - - if (lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid < 128) - { - localmem_min[lid] = min(minval, localmem_min[lid]); - localmem_max[lid] = max(maxval, localmem_max[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - { - dst[gid] = localmem_min[0]; - dst[gid + groupnum] = localmem_max[0]; - } + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + + int idx = offset + id + (id / cols) * invalid_cols; + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = src[idx]; + minval = min(minval, temp); + maxval = max(maxval, temp); + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + dst[gid] = localmem_min[0]; + dst[gid + groupnum] = localmem_max[0]; + } } __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, @@ -120,57 +120,57 @@ __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, int elemnum, int groupnum, const __global uchar * mask, int minvalid_cols, int moffset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - - unsigned int idx = offset + id + (id / cols) * invalid_cols; - unsigned int midx = moffset + id + (id / cols) * minvalid_cols; - - __local T localmem_max[128], localmem_min[128]; - T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; - - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - midx = moffset + id + (id / cols) * minvalid_cols; - - if (mask[midx]) - { - temp = src[idx]; - minval = min(minval, temp); - maxval = max(maxval, temp); - } - } - - if (lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid < 128) - { - localmem_min[lid] = min(minval, localmem_min[lid]); - localmem_max[lid] = max(maxval, localmem_max[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - { - dst[gid] = localmem_min[0]; - dst[gid + groupnum] = localmem_max[0]; - } + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + + int idx = offset + id + (id / cols) * invalid_cols; + int midx = moffset + id + (id / cols) * minvalid_cols; + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + midx = moffset + id + (id / cols) * minvalid_cols; + + if (mask[midx]) + { + temp = src[idx]; + minval = min(minval, temp); + maxval = max(maxval, temp); + } + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + dst[gid] = localmem_min[0]; + dst[gid + groupnum] = localmem_max[0]; + } } diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl index 21f95611b5..076fb06001 100644 --- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc.cl @@ -137,118 +137,114 @@ #define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; #endif - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable - /**************************************Array minMax**************************************/ __kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __global VEC_TYPE *src, __global RES_TYPE *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; - - __local VEC_TYPE localmem_max[128], localmem_min[128]; - VEC_TYPE minval, maxval, temp; - - __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; - VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; - - int idx_c; - - if (id < elemnum) - { - temp = src[idx]; - idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); - - if (id % cols == 0 ) - { - repeat_s(temp); - repeat_s(temploc); - } - if (id % cols == cols - 1) - { - repeat_e(temp); - repeat_e(temploc); - } - minval = temp; - maxval = temp; - minloc = temploc; - maxloc = temploc; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - minloc = negative; - maxloc = negative; - } - - int grainSize = (groupnum << 8); - for (id = id + grainSize; id < elemnum; id = id + grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = src[idx]; - idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); - - if (id % cols == 0 ) - { - repeat_s(temp); - repeat_s(temploc); - } - if (id % cols == cols - 1) - { - repeat_e(temp); - repeat_e(temploc); - } - - minval = min(minval, temp); - maxval = max(maxval, temp); - minloc = CONDITION_FUNC(minval == temp, temploc, minloc); - maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); - } - - if (lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - localmem_minloc[lid - 128] = minloc; - localmem_maxloc[lid - 128] = maxloc; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid < 128) - { - localmem_min[lid] = min(minval,localmem_min[lid]); - localmem_max[lid] = max(maxval,localmem_max[lid]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = offset + id + (id / cols) * invalid_cols; + + __local VEC_TYPE localmem_max[128], localmem_min[128]; + VEC_TYPE minval, maxval, temp; + + __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; + VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; + + int idx_c; + + if (id < elemnum) + { + temp = src[idx]; + idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); + + if (id % cols == 0 ) + { + repeat_s(temp); + repeat_s(temploc); + } + if (id % cols == cols - 1) + { + repeat_e(temp); + repeat_e(temploc); + } + minval = temp; + maxval = temp; + minloc = temploc; + maxloc = temploc; + } + else + { + minval = MAX_VAL; + maxval = MIN_VAL; + minloc = negative; + maxloc = negative; + } + + int grainSize = (groupnum << 8); + for (id = id + grainSize; id < elemnum; id = id + grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = src[idx]; + idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); + + if (id % cols == 0 ) + { + repeat_s(temp); + repeat_s(temploc); + } + if (id % cols == cols - 1) + { + repeat_e(temp); + repeat_e(temploc); + } + + minval = min(minval, temp); + maxval = max(maxval, temp); + minloc = CONDITION_FUNC(minval == temp, temploc, minloc); + maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + localmem_minloc[lid - 128] = minloc; + localmem_maxloc[lid - 128] = maxloc; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval,localmem_min[lid]); + localmem_max[lid] = max(maxval,localmem_max[lid]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { if (lid < lsize) { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); } barrier(CLK_LOCAL_MEM_FENCE); - } - - if ( lid == 0) - { - dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); - dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); - dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); - dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); - } + } + + if ( lid == 0) + { + dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); + dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); + dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); + dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); + } } diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl index 6d514e99d3..4d73be9541 100644 --- a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl @@ -147,96 +147,96 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src, int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = id + (id / cols) * invalid_cols; - unsigned int midx = id + (id / cols) * minvalid_cols; - __local VEC_TYPE lm_max[128],lm_min[128]; - VEC_TYPE minval,maxval,temp,m_temp; - __local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128]; - VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; - if(id < elemnum) - { - temp = vload4(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); - int idx_c = (idx << 2) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; - maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; - minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative); - maxloc = minloc; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - minloc = negative; - maxloc = negative; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) - { - idx = id + (id / cols) * invalid_cols; - midx = id + (id / cols) * minvalid_cols; - temp = vload4(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); - int idx_c = (idx << 2) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); - maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval); + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = id + (id / cols) * invalid_cols; + int midx = id + (id / cols) * minvalid_cols; + __local VEC_TYPE lm_max[128],lm_min[128]; + VEC_TYPE minval,maxval,temp,m_temp; + __local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128]; + VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; + if(id < elemnum) + { + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); + if(id % cols == cols - 1) + { + repeat_me(m_temp); + repeat_e(temploc); + } + minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; + maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; + minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative); + maxloc = minloc; + } + else + { + minval = MAX_VAL; + maxval = MIN_VAL; + minloc = negative; + maxloc = negative; + } + for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) + { + idx = id + (id / cols) * invalid_cols; + midx = id + (id / cols) * minvalid_cols; + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); + if(id % cols == cols - 1) + { + repeat_me(m_temp); + repeat_e(temploc); + } + minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); + maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval); - minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc); - maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc); - } - if(lid > 127) - { - lm_min[lid - 128] = minval; - lm_max[lid - 128] = maxval; - lm_minloc[lid - 128] = minloc; - lm_maxloc[lid - 128] = maxloc; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { - lm_min[lid] = min(minval,lm_min[lid]); - lm_max[lid] = max(maxval,lm_max[lid]); - VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero); - VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero); - lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]); - lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) - { - if(lid < lsize) - { - int lid2 = lsize + lid; - lm_min[lid] = min(lm_min[lid] , lm_min[lid2]); - lm_max[lid] = max(lm_max[lid] , lm_max[lid2]); - VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero); - VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero); - lm_minloc[lid] = - CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]); - lm_maxloc[lid] = - CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if( lid == 0) - { - dst[gid] = CONVERT_RES_TYPE(lm_min[0]); - dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]); - dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]); - dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); - } + minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc); + maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc); + } + if(lid > 127) + { + lm_min[lid - 128] = minval; + lm_max[lid - 128] = maxval; + lm_minloc[lid - 128] = minloc; + lm_maxloc[lid - 128] = maxloc; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 128) + { + lm_min[lid] = min(minval,lm_min[lid]); + lm_max[lid] = max(maxval,lm_max[lid]); + VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero); + VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero); + lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]); + lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(int lsize = 64; lsize > 0; lsize >>= 1) + { + if(lid < lsize) + { + int lid2 = lsize + lid; + lm_min[lid] = min(lm_min[lid] , lm_min[lid2]); + lm_max[lid] = max(lm_max[lid] , lm_max[lid2]); + VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero); + VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero); + lm_minloc[lid] = + CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]); + lm_maxloc[lid] = + CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if( lid == 0) + { + dst[gid] = CONVERT_RES_TYPE(lm_min[0]); + dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]); + dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]); + dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); + } } diff --git a/modules/ocl/src/opencl/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl index 085386f5c3..fc98257962 100644 --- a/modules/ocl/src/opencl/arithm_nonzero.cl +++ b/modules/ocl/src/opencl/arithm_nonzero.cl @@ -55,11 +55,11 @@ __kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __global srcT *src, __global dstT *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; + int idx = offset + id + (id / cols) * invalid_cols; __local dstT localmem_nonzero[128]; dstT nonzero = (dstT)(0); srcT zero = (srcT)(0), one = (srcT)(1); diff --git a/modules/ocl/src/opencl/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl index b6bc7b42b4..f9835948c4 100644 --- a/modules/ocl/src/opencl/arithm_phase.cl +++ b/modules/ocl/src/opencl/arithm_phase.cl @@ -45,15 +45,17 @@ // #if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable + #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #elif defined (cl_amd_fp64) + #pragma OPENCL EXTENSION cl_amd_fp64:enable + #endif + #define CV_PI 3.1415926535897932384626433832795 + #define CV_2PI 2*CV_PI +#else + #define CV_PI 3.1415926535897932384626433832795f + #define CV_2PI 2*CV_PI #endif -#endif - -#define CV_PI 3.1415926535898 -#define CV_2PI 2*3.1415926535898 /**************************************phase inradians**************************************/ diff --git a/modules/ocl/src/opencl/arithm_polarToCart.cl b/modules/ocl/src/opencl/arithm_polarToCart.cl index 8af840db82..8469cdb097 100644 --- a/modules/ocl/src/opencl/arithm_polarToCart.cl +++ b/modules/ocl/src/opencl/arithm_polarToCart.cl @@ -43,12 +43,13 @@ // //M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable +#ifdef DOUBLE_SUPPORT + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #define CV_PI 3.1415926535897932384626433832795 +#else + #define CV_PI 3.1415926535897932384626433832795f #endif -#define CV_PI 3.1415926535897932384626433832795 - ///////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////polarToCart with magnitude////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -72,7 +73,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in float x = *((__global float *)((__global char *)src1 + src1_index)); float y = *((__global float *)((__global char *)src2 + src2_index)); - float ascale = CV_PI/180.0; + float ascale = CV_PI/180.0f; float alpha = angInDegree == 1 ? y * ascale : y; float a = cos(alpha) * x; float b = sin(alpha) * x; @@ -134,7 +135,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr float y = *((__global float *)((__global char *)src + src_index)); - float ascale = CV_PI/180.0; + float ascale = CV_PI/180.0f; float alpha = angInDegree == 1 ? y * ascale : y; float a = cos(alpha); float b = sin(alpha); diff --git a/modules/ocl/src/opencl/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl index 6eb6e48323..7ada5be4c1 100644 --- a/modules/ocl/src/opencl/arithm_sum.cl +++ b/modules/ocl/src/opencl/arithm_sum.cl @@ -66,39 +66,39 @@ __kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global srcT *src, __global dstT *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = offset + id + (id / cols) * invalid_cols; - __local dstT localmem_sum[128]; - dstT sum = (dstT)(0), temp; + __local dstT localmem_sum[128]; + dstT sum = (dstT)(0), temp; - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = convertToDstT(src[idx]); - FUNC(temp, sum); - } + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = convertToDstT(src[idx]); + FUNC(temp, sum); + } - if (lid > 127) - localmem_sum[lid - 128] = sum; - barrier(CLK_LOCAL_MEM_FENCE); + if (lid > 127) + localmem_sum[lid - 128] = sum; + barrier(CLK_LOCAL_MEM_FENCE); - if (lid < 128) - localmem_sum[lid] = sum + localmem_sum[lid]; - barrier(CLK_LOCAL_MEM_FENCE); + if (lid < 128) + localmem_sum[lid] = sum + localmem_sum[lid]; + barrier(CLK_LOCAL_MEM_FENCE); - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } - if (lid == 0) - dst[gid] = localmem_sum[0]; + if (lid == 0) + dst[gid] = localmem_sum[0]; } diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 8f85f7d936..ce0d86e8a4 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -64,7 +64,7 @@ #endif //http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel -int bit1Count(int v) +static int bit1Count(int v) { v = v - ((v >> 1) & 0x55555555); // reuse input as temporary v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp @@ -95,7 +95,7 @@ typedef int result_type; #define DIST_RES(x) (x) #endif -result_type reduce_block( +static result_type reduce_block( __local value_type *s_query, __local value_type *s_train, int lidx, @@ -113,7 +113,7 @@ result_type reduce_block( return DIST_RES(result); } -result_type reduce_block_match( +static result_type reduce_block_match( __local value_type *s_query, __local value_type *s_train, int lidx, @@ -131,7 +131,7 @@ result_type reduce_block_match( return (result); } -result_type reduce_multi_block( +static result_type reduce_multi_block( __local value_type *s_query, __local value_type *s_train, int block_index, @@ -187,7 +187,6 @@ __kernel void BruteForceMatch_UnrollMatch( int myBestTrainIdx = -1; // loopUnrolledCached to find the best trainIdx and best distance. - volatile int imgIdx = 0; for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) { result_type result = 0; @@ -212,7 +211,6 @@ __kernel void BruteForceMatch_UnrollMatch( if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) { - //bestImgIdx = imgIdx; myBestDistance = result; myBestTrainIdx = trainIdx; } @@ -304,7 +302,6 @@ __kernel void BruteForceMatch_Match( if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) { - //myBestImgidx = imgIdx; myBestDistance = result; myBestTrainIdx = trainIdx; } @@ -390,11 +387,10 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); if(ind < bestTrainIdx_cols) { - //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; } @@ -451,11 +447,10 @@ __kernel void BruteForceMatch_RadiusMatch( if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind = atom_inc(nMatches + queryIdx); + int ind = atom_inc(nMatches + queryIdx); if(ind < bestTrainIdx_cols) { - //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; } @@ -498,7 +493,6 @@ __kernel void BruteForceMatch_knnUnrollMatch( int myBestTrainIdx2 = -1; //loopUnrolledCached - volatile int imgIdx = 0; for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { result_type result = 0; diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index fcbf67ca7a..01286f7ad7 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -50,8 +50,6 @@ #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -#define DATA_TYPE UNDEFINED - #if defined (DEPTH_0) #define DATA_TYPE uchar #define MAX_NUM 255 @@ -73,6 +71,10 @@ #define SAT_CAST(num) (num) #endif +#ifndef DATA_TYPE + #define DATA_TYPE UNDEFINED +#endif + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) enum diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 1d53f2b880..9e4ab2fe71 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -37,7 +37,6 @@ // // -#pragma OPENCL EXTENSION cl_amd_printf : enable #define CV_HAAR_FEATURE_MAX 3 #define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) diff --git a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl index 17e95b4e4a..b7a8ce1379 100644 --- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl +++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl @@ -120,7 +120,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( int grpidx = get_group_id(0); int lclidx = get_local_id(0); int lclidy = get_local_id(1); - int lcl_sz = mul24(grpszx, grpszy); int lcl_id = mad24(lclidy, grpszx, lclidx); __local int glboutindex[1]; __local int lclcount[1]; diff --git a/modules/ocl/src/opencl/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl index bf54d3867d..0a981e12e8 100644 --- a/modules/ocl/src/opencl/imgproc_calcHarris.cl +++ b/modules/ocl/src/opencl/imgproc_calcHarris.cl @@ -99,7 +99,6 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g int col = get_local_id(0); int gX = get_group_id(0); int gY = get_group_id(1); - int glx = get_global_id(0); int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; @@ -126,11 +125,11 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g { dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? dx_s : 0.0; + dx_data[i] = dx_con ? dx_s : 0.0f; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? dy_s : 0.0; + dy_data[i] = dy_con ? dy_s : 0.0f; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; @@ -155,7 +154,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g data[2][i] = dy_data[i] * dy_data[i]; } #endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f; for (int i=1; i < ksY; i++) { sum0 += data[0][i]; @@ -183,7 +182,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g int posX = dst_startX - dst_x_off + col - anX; int posY = (gly << 1); int till = (ksX + 1)%2; - float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; + float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; for (int k=0; k<6; k++) for (int i=-anX; i<=anX - till; i++) tmp_sum[k] += temp[k][col+i]; diff --git a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl index 5f39176e99..110d204a59 100644 --- a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl +++ b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl @@ -98,7 +98,6 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, int col = get_local_id(0); int gX = get_group_id(0); int gY = get_group_id(1); - int glx = get_global_id(0); int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; @@ -125,10 +124,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, { dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? dx_s : 0.0; + dx_data[i] = dx_con ? dx_s : 0.0f; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? dy_s : 0.0; + dy_data[i] = dy_con ? dy_s : 0.0f; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i]; @@ -152,7 +151,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, data[2][i] = dy_data[i] * dy_data[i]; } #endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f; for (int i=1; i < ksY; i++) { sum0 += (data[0][i]); @@ -180,7 +179,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, int posX = dst_startX - dst_x_off + col - anX; int posY = (gly << 1); int till = (ksX + 1)%2; - float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; + float tmp_sum[6] = { 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f }; for (int k=0; k<6; k++) for (int i=-anX; i<=anX - till; i++) tmp_sum[k] += temp[k][col+i]; diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl index c77cae99a3..0a54f1468c 100644 --- a/modules/ocl/src/opencl/imgproc_canny.cl +++ b/modules/ocl/src/opencl/imgproc_canny.cl @@ -43,9 +43,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - #ifdef L2GRAD inline float calc(int x, int y) { @@ -248,7 +245,12 @@ void calcMagnitude ////////////////////////////////////////////////////////////////////////////////////////// // 0.4142135623730950488016887242097 is tan(22.5) #define CANNY_SHIFT 15 -#define TG22 (int)(0.4142135623730950488016887242097*(1<=src_cols ? x=src_cols-1,u=0 : x,u; - y<0 ? y=0,v=0 : y,v; - y>=src_rows ? y=src_rows-1,v=0 : y,v; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; u = u * INTER_RESIZE_COEF_SCALE; v = v * INTER_RESIZE_COEF_SCALE; @@ -225,10 +225,10 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, int x = floor(sx), y = floor(sy); float u = sx - x, v = sy - y; - x<0 ? x=0,u=0 : x,u; - x>=src_cols ? x=src_cols-1,u=0 : x,u; - y<0 ? y=0,v=0 : y,v; - y>=src_rows ? y=src_rows-1,v=0 : y,v; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); @@ -264,10 +264,10 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, int x = floor(sx), y = floor(sy); float u = sx - x, v = sy - y; - x<0 ? x=0,u=0 : x; - x>=src_cols ? x=src_cols-1,u=0 : x; - y<0 ? y=0,v=0 : y; - y>=src_rows ? y=src_rows-1,v=0 : y; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl index 6b847c83f8..400ac806cf 100644 --- a/modules/ocl/src/opencl/imgproc_threshold.cl +++ b/modules/ocl/src/opencl/imgproc_threshold.cl @@ -71,18 +71,18 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src #else VT sdata = VLOADN(0, src + src_index); #endif - VT vthresh = (VT)(thresh), zero = (VT)(0); + VT vthresh = (VT)(thresh); #ifdef THRESH_BINARY - VT vecValue = sdata > vthresh ? max_val : zero; + VT vecValue = sdata > vthresh ? max_val : (VT)(0); #elif defined THRESH_BINARY_INV - VT vecValue = sdata > vthresh ? zero : max_val; + VT vecValue = sdata > vthresh ? (VT)(0) : max_val; #elif defined THRESH_TRUNC VT vecValue = sdata > vthresh ? thresh : sdata; #elif defined THRESH_TOZERO - VT vecValue = sdata > vthresh ? sdata : zero; + VT vecValue = sdata > vthresh ? sdata : (VT)(0); #elif defined THRESH_TOZERO_INV - VT vecValue = sdata > vthresh ? zero : sdata; + VT vecValue = sdata > vthresh ? (VT)(0) : sdata; #endif if (gx + VECSIZE <= max_index) @@ -117,18 +117,18 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src int src_index = mad24(gy, src_step, src_offset + gx); int dst_index = mad24(gy, dst_step, dst_offset + gx); - T sdata = src[src_index], zero = (T)(0); + T sdata = src[src_index]; #ifdef THRESH_BINARY - dst[dst_index] = sdata > thresh ? max_val : zero; + dst[dst_index] = sdata > thresh ? max_val : (T)(0); #elif defined THRESH_BINARY_INV - dst[dst_index] = sdata > thresh ? zero : max_val; + dst[dst_index] = sdata > thresh ? (T)(0) : max_val; #elif defined THRESH_TRUNC dst[dst_index] = sdata > thresh ? thresh : sdata; #elif defined THRESH_TOZERO - dst[dst_index] = sdata > thresh ? sdata : zero; + dst[dst_index] = sdata > thresh ? sdata : (T)(0); #elif defined THRESH_TOZERO_INV - dst[dst_index] = sdata > thresh ? zero : sdata; + dst[dst_index] = sdata > thresh ? (T)(0) : sdata; #endif } } diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl index caafdfb92c..a5050bbf03 100644 --- a/modules/ocl/src/opencl/imgproc_warpAffine.cl +++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl @@ -537,9 +537,9 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; @@ -680,9 +680,9 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl index dc37c1f04d..eee1c81750 100644 --- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -133,7 +133,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -150,9 +150,9 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ short itab[4]; float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay; tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax; tab1x[1] = 1.f/INTER_TAB_SIZE*ax; #pragma unroll 4 @@ -185,7 +185,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -265,7 +265,7 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; + W = (W != 0.0f) ? 1.f/W : 0.0f; short sx = convert_short_sat_rte(X0*W); short sy = convert_short_sat_rte(Y0*W); @@ -289,7 +289,7 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -341,7 +341,7 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -424,7 +424,7 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; + W = (W != 0.0f) ? 1.f/W : 0.0f; short sx = convert_short_sat_rte(X0*W); short sy = convert_short_sat_rte(Y0*W); @@ -447,7 +447,7 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -465,9 +465,9 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay; taby[1] = 1.f/INTER_TAB_SIZE*ay; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax; tabx[1] = 1.f/INTER_TAB_SIZE*ax; tab[0] = taby[0] * tabx[0]; @@ -497,7 +497,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -557,7 +557,7 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W =(W != 0.0)? 1./W : 0.0; + W =(W != 0.0f)? 1.f/W : 0.0f; short sx = convert_short_sat_rte(X0*W); short sy = convert_short_sat_rte(Y0*W); @@ -583,7 +583,7 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); @@ -602,9 +602,9 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; @@ -636,7 +636,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); diff --git a/modules/ocl/src/opencl/kernel_sort_by_key.cl b/modules/ocl/src/opencl/kernel_sort_by_key.cl index 0ad11b8bcf..0e8d581b74 100644 --- a/modules/ocl/src/opencl/kernel_sort_by_key.cl +++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl @@ -192,7 +192,6 @@ __kernel { const int i = get_local_id(0); // index in workgroup const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); const int wg = get_local_size(0); // workgroup size = block size int pos = 0, same = 0; const int offset = get_group_id(0) * wg; diff --git a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl index 2d2c0a19cd..2d38fbf2f7 100644 --- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl +++ b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl @@ -63,7 +63,7 @@ ///////////// parallel merge sort /////////////// // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl -uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) +static uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) { // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence uint firstIndex = left; @@ -94,7 +94,7 @@ uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) // by a base pointer and left and right index for a particular candidate value. The comparison operator is // passed as a functor parameter my_comp // This function returns an index that is the first index whos value would be equal to the searched value -uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +static uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) { // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence uint firstIndex = left; @@ -130,7 +130,7 @@ uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) // passed as a functor parameter my_comp // This function returns an index that is the first index whos value would be greater than the searched value // If the search value is not found in the sequence, upperbound returns the same result as lowerbound -uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +static uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) { uint upperBound = lowerBoundBinary( data, left, right, searchVal ); @@ -167,9 +167,6 @@ kernel void merge( ) { size_t globalID = get_global_id( 0 ); - size_t groupID = get_group_id( 0 ); - size_t localID = get_local_id( 0 ); - size_t wgSize = get_local_size( 0 ); // Abort threads that are passed the end of the input vector if( globalID >= srcVecSize ) @@ -230,12 +227,12 @@ kernel void blockInsertionSort( local V_T* val_lds ) { - size_t gloId = get_global_id( 0 ); - size_t groId = get_group_id( 0 ); - size_t locId = get_local_id( 0 ); - size_t wgSize = get_local_size( 0 ); + int gloId = get_global_id( 0 ); + int groId = get_group_id( 0 ); + int locId = get_local_id( 0 ); + int wgSize = get_local_size( 0 ); - bool in_range = gloId < vecSize; + bool in_range = gloId < (int)vecSize; K_T key; V_T val; // Abort threads that are passed the end of the input vector @@ -254,7 +251,7 @@ kernel void blockInsertionSort( { // The last workgroup may have an irregular size, so we calculate a per-block endIndex // endIndex is essentially emulating a mod operator with subtraction and multiply - size_t endIndex = vecSize - ( groId * wgSize ); + int endIndex = vecSize - ( groId * wgSize ); endIndex = min( endIndex, wgSize ); // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); diff --git a/modules/ocl/src/opencl/knearest.cl b/modules/ocl/src/opencl/knearest.cl index e670df7e6f..bc0ae89a83 100644 --- a/modules/ocl/src/opencl/knearest.cl +++ b/modules/ocl/src/opencl/knearest.cl @@ -129,58 +129,53 @@ __kernel void knn_find_nearest(__global float* sample, int sample_row, int sampl } /*! find_nearest_neighbor done!*/ /*! write_results start!*/ - switch (regression) + if (regression) { - case true: - { - TYPE s; + TYPE s; #ifdef DOUBLE_SUPPORT - s = 0.0; + s = 0.0; #else - s = 0.0f; + s = 0.0f; #endif - for(j = 0; j < K1; j++) - s += nr[j * nThreads + threadY]; + for(j = 0; j < K1; j++) + s += nr[j * nThreads + threadY]; - _results[y * _results_step] = (float)(s * inv_scale); - } - break; - case false: - { - int prev_start = 0, best_count = 0, cur_count; - float best_val; + _results[y * _results_step] = (float)(s * inv_scale); + } + else + { + int prev_start = 0, best_count = 0, cur_count; + float best_val; - for(j = K1 - 1; j > 0; j--) + for(j = K1 - 1; j > 0; j--) + { + bool swap_f1 = false; + for(j1 = 0; j1 < j; j1++) { - bool swap_f1 = false; - for(j1 = 0; j1 < j; j1++) + if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY]) { - if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY]) - { - int t; - CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t); - swap_f1 = true; - } + int t; + CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t); + swap_f1 = true; } - if(!swap_f1) - break; } + if(!swap_f1) + break; + } - best_val = 0; - for(j = 1; j <= K1; j++) - if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY]) + best_val = 0; + for(j = 1; j <= K1; j++) + if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY]) + { + cur_count = j - prev_start; + if(best_count < cur_count) { - cur_count = j - prev_start; - if(best_count < cur_count) - { - best_count = cur_count; - best_val = nr[(j - 1) * nThreads + threadY]; - } - prev_start = j; + best_count = cur_count; + best_val = nr[(j - 1) * nThreads + threadY]; } - _results[y * _results_step] = best_val; - } - break; + prev_start = j; + } + _results[y * _results_step] = best_val; } ///*! write_results done!*/ } diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 6fc4c748cf..8b63c3bd2d 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -43,8 +43,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable - #if defined (DOUBLE_SUPPORT) #ifdef cl_khr_fp64 @@ -70,7 +68,7 @@ #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one -float normAcc(float num, float denum) +inline float normAcc(float num, float denum) { if(fabs(num) < denum) { @@ -83,7 +81,7 @@ float normAcc(float num, float denum) return 0; } -float normAcc_SQDIFF(float num, float denum) +inline float normAcc_SQDIFF(float num, float denum) { if(fabs(num) < denum) { diff --git a/modules/ocl/src/opencl/meanShift.cl b/modules/ocl/src/opencl/meanShift.cl index 728e2f9695..ea5060e467 100644 --- a/modules/ocl/src/opencl/meanShift.cl +++ b/modules/ocl/src/opencl/meanShift.cl @@ -46,7 +46,7 @@ // //M*/ -short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, +static short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, __global uchar4* in, int in_step, int dst_off, int src_off, int cols, int rows, int sp, int sr, int maxIter, float eps) { @@ -56,7 +56,6 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, src_off = src_off >> 2; dst_off = dst_off >> 2; int idx = src_off + y0 * in_step + x0; -// uchar4 c = vload4(0, (__global uchar*)in+idx); uchar4 c = in[idx]; int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ; diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index 602ebd1c1d..31c4c85ec7 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -162,7 +162,6 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s WT4 x3 = (WT4)(0.f); __global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256; - bool switchFlag = false; WT4 p; WT4 x; @@ -173,7 +172,7 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s if(dy < src_rows) { - if((x_rest > 0) && (gidx == (get_num_groups(0) - 1))) + if((x_rest > 0) && (gidx == ((int)get_num_groups(0) - 1))) { int i; for(i = 0; i < x_rest - 4; i += 4) @@ -190,11 +189,8 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s } x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; - x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; - x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; - x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; WT x0_ = 0; @@ -238,11 +234,8 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s } x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; - x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; - x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; - x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; } @@ -251,7 +244,7 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s } __local WT mom[10][256]; - if((y_rest > 0) && (gidy == (get_num_groups(1) - 1))) + if((y_rest > 0) && (gidy == ((int)get_num_groups(1) - 1))) { if(ly < y_rest) { @@ -268,13 +261,10 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s } barrier(CLK_LOCAL_MEM_FENCE); if(ly < 10) - { for(int i = 1; i < y_rest; i++) - { mom[ly][0] = mom[ly][i] + mom[ly][0]; - } - } - }else + } + else { mom[9][ly] = py * sy; mom[8][ly] = x1.s0 * sy; @@ -413,11 +403,9 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s if(binary) { - WT s = 1./255; + WT s = 1.0f/255; if(ly < 10) - { mom[ly][0] *= s; - } barrier(CLK_LOCAL_MEM_FENCE); } WT xm = (gidx * 256) * mom[0][0]; @@ -440,7 +428,5 @@ __kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int s barrier(CLK_LOCAL_MEM_FENCE); if(ly < 10) - { dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1]; - } } diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 685eccf688..0d2f26f966 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -200,7 +200,7 @@ __kernel void normalize_hists_36_kernel(__global float* block_hists, //------------------------------------------------------------- // Normalization of histograms via L2Hys_norm // -float reduce_smem(volatile __local float* smem, int size) +static float reduce_smem(volatile __local float* smem, int size) { unsigned int tid = get_local_id(0); float sum = smem[tid]; @@ -564,7 +564,6 @@ __kernel void compute_gradients_8UC4_kernel( const int x = get_global_id(0); const int tid = get_local_id(0); const int gSizeX = get_local_size(0); - const int gidX = get_group_id(0); const int gidY = get_group_id(1); __global const uchar4* row = img + gidY * img_step; @@ -667,7 +666,6 @@ __kernel void compute_gradients_8UC1_kernel( const int x = get_global_id(0); const int tid = get_local_id(0); const int gSizeX = get_local_size(0); - const int gidX = get_group_id(0); const int gidY = get_group_id(1); __global const uchar* row = img + gidY * img_step; diff --git a/modules/ocl/src/opencl/optical_flow_farneback.cl b/modules/ocl/src/opencl/optical_flow_farneback.cl index 917f7f215d..4725662c60 100644 --- a/modules/ocl/src/opencl/optical_flow_farneback.cl +++ b/modules/ocl/src/opencl/optical_flow_farneback.cl @@ -44,10 +44,10 @@ //M*/ -#define tx get_local_id(0) +#define tx (int)get_local_id(0) #define ty get_local_id(1) #define bx get_group_id(0) -#define bdx get_local_size(0) +#define bdx (int)get_local_size(0) #define BORDER_SIZE 5 #define MAX_KSIZE_HALF 100 diff --git a/modules/ocl/src/opencl/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl index e09846457c..6f10067e9f 100644 --- a/modules/ocl/src/opencl/pyr_down.cl +++ b/modules/ocl/src/opencl/pyr_down.cl @@ -43,32 +43,32 @@ // //M*/ -int idx_row_low(int y, int last_row) +inline int idx_row_low(int y, int last_row) { return abs(y) % (last_row + 1); } -int idx_row_high(int y, int last_row) +inline int idx_row_high(int y, int last_row) { return abs(last_row - (int)abs(last_row - y)) % (last_row + 1); } -int idx_row(int y, int last_row) +inline int idx_row(int y, int last_row) { return idx_row_low(idx_row_high(y, last_row), last_row); } -int idx_col_low(int x, int last_col) +inline int idx_col_low(int x, int last_col) { return abs(x) % (last_col + 1); } -int idx_col_high(int x, int last_col) +inline int idx_col_high(int x, int last_col) { return abs(last_col - (int)abs(last_col - x)) % (last_col + 1); } -int idx_col(int x, int last_col) +inline int idx_col(int x, int last_col) { return idx_col_low(idx_col_high(x, last_col), last_col); } diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index 85f4d39343..a7fc27838b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -53,7 +53,8 @@ #define WAVE_SIZE 1 #endif #ifdef CPU -void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) + +static void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -72,7 +73,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local } } -void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +static void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -89,7 +90,7 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l } } -void reduce1(float val1, volatile __local float* smem1, int tid) +static void reduce1(float val1, volatile __local float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -104,7 +105,7 @@ void reduce1(float val1, volatile __local float* smem1, int tid) } } #else -void reduce3(float val1, float val2, float val3, +static void reduce3(float val1, float val2, float val3, __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) { smem1[tid] = val1; @@ -151,7 +152,7 @@ void reduce3(float val1, float val2, float val3, barrier(CLK_LOCAL_MEM_FENCE); } -void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) +static void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -190,7 +191,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola barrier(CLK_LOCAL_MEM_FENCE); } -void reduce1(float val1, __local volatile float* smem1, int tid) +static void reduce1(float val1, __local volatile float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -226,7 +227,7 @@ void reduce1(float val1, __local volatile float* smem1, int tid) // Image read mode __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; -void SetPatch(image2d_t I, float x, float y, +static void SetPatch(image2d_t I, float x, float y, float* Pch, float* Dx, float* Dy, float* A11, float* A12, float* A22) { @@ -247,7 +248,7 @@ void SetPatch(image2d_t I, float x, float y, *A22 += dIdy * dIdy; } -void GetPatch(image2d_t J, float x, float y, +inline void GetPatch(image2d_t J, float x, float y, float* Pch, float* Dx, float* Dy, float* b1, float* b2) { @@ -257,13 +258,13 @@ void GetPatch(image2d_t J, float x, float y, *b2 += diff**Dy; } -void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) +inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) { float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; *errval += fabs(diff); } -void SetPatch4(image2d_t I, const float x, const float y, +static void SetPatch4(image2d_t I, const float x, const float y, float4* Pch, float4* Dx, float4* Dy, float* A11, float* A12, float* A22) { @@ -286,7 +287,7 @@ void SetPatch4(image2d_t I, const float x, const float y, *A22 += sqIdx.x + sqIdx.y + sqIdx.z; } -void GetPatch4(image2d_t J, const float x, const float y, +static void GetPatch4(image2d_t J, const float x, const float y, const float4* Pch, const float4* Dx, const float4* Dy, float* b1, float* b2) { @@ -298,7 +299,7 @@ void GetPatch4(image2d_t J, const float x, const float y, *b2 += xdiff.x + xdiff.y + xdiff.z; } -void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) +static void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) { float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); @@ -318,7 +319,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, unsigned int gid=get_group_id(0); unsigned int xsize=get_local_size(0); unsigned int ysize=get_local_size(1); - int xBase, yBase, i, j, k; + int xBase, yBase, k; float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); @@ -597,7 +598,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, unsigned int gid=get_group_id(0); unsigned int xsize=get_local_size(0); unsigned int ysize=get_local_size(1); - int xBase, yBase, i, j, k; + int xBase, yBase, k; float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); diff --git a/modules/ocl/src/opencl/split_mat.cl b/modules/ocl/src/opencl/split_mat.cl index 7e1b15c994..b9aa048b07 100644 --- a/modules/ocl/src/opencl/split_mat.cl +++ b/modules/ocl/src/opencl/split_mat.cl @@ -183,7 +183,7 @@ __kernel void split_vector( int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \ int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \ int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \ - if (!BYPASS_VSTORE && dst ## xOffsetBytes + sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \ + if (!BYPASS_VSTORE && dst ## xOffsetBytes + (int)sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \ { \ VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \ } \ @@ -192,7 +192,7 @@ __kernel void split_vector( VEC_TO_ARRAY(vecValue, vecValue##Array); \ for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \ { \ - if (dst ## xOffsetBytes + sizeof(TYPE) <= dst ## xOffsetLimitBytes) \ + if (dst ## xOffsetBytes + (int)sizeof(TYPE) <= dst ## xOffsetLimitBytes) \ *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \ else \ break; \ diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 773aee618f..207bf0047f 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -56,7 +56,7 @@ #define radius 64 #endif -unsigned int CalcSSD(__local unsigned int *col_ssd) +static unsigned int CalcSSD(__local unsigned int *col_ssd) { unsigned int cache = col_ssd[0]; @@ -67,7 +67,7 @@ unsigned int CalcSSD(__local unsigned int *col_ssd) return cache; } -uint2 MinSSD(__local unsigned int *col_ssd) +static uint2 MinSSD(__local unsigned int *col_ssd) { unsigned int ssd[N_DISPARITIES]; const int win_size = (radius << 1); @@ -95,7 +95,7 @@ uint2 MinSSD(__local unsigned int *col_ssd) return (uint2)(mssd, bestIdx); } -void StepDown(int idx1, int idx2, __global unsigned char* imageL, +static void StepDown(int idx1, int idx2, __global unsigned char* imageL, __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7))); @@ -114,7 +114,7 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL, col_ssd[7 * (BLOCK_W + win_size)] += res.s0; } -void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, +static void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { @@ -153,7 +153,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; -#define Y (get_group_id(1) * ROWSperTHREAD + radius) +#define Y (int)(get_group_id(1) * ROWSperTHREAD + radius) __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; @@ -241,7 +241,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned /////////////////////////////////// Textureness filtering //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// -float sobel(__global unsigned char *input, int x, int y, int rows, int cols) +static float sobel(__global unsigned char *input, int x, int y, int rows, int cols) { float conv = 0; int y1 = y==0? 0 : y-1; @@ -256,7 +256,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols) return fabs(conv); } -float CalcSums(__local float *cols, __local float *cols_cache, int winsz) +static float CalcSums(__local float *cols, __local float *cols_cache, int winsz) { unsigned int cache = cols[0]; diff --git a/modules/ocl/src/opencl/stereobp.cl b/modules/ocl/src/opencl/stereobp.cl index 4818399c57..ec02f827a9 100644 --- a/modules/ocl/src/opencl/stereobp.cl +++ b/modules/ocl/src/opencl/stereobp.cl @@ -65,7 +65,7 @@ /////////////////////////////////////////////////////////////// /////////////////common/////////////////////////////////////// ///////////////////////////////////////////////////////////// -T saturate_cast(float v){ +inline T saturate_cast(float v){ #ifdef T_SHORT return convert_short_sat_rte(v); #else @@ -73,7 +73,7 @@ T saturate_cast(float v){ #endif } -T4 saturate_cast4(float4 v){ +inline T4 saturate_cast4(float4 v){ #ifdef T_SHORT return convert_short4_sat_rte(v); #else @@ -99,7 +99,7 @@ inline float pix_diff_1(const uchar4 l, __global const uchar *rs) return abs((int)(l.x) - *rs); } -float pix_diff_4(const uchar4 l, __global const uchar *rs) +static float pix_diff_4(const uchar4 l, __global const uchar *rs) { uchar4 r; r = *((__global uchar4 *)rs); @@ -235,7 +235,7 @@ __kernel void level_up_message(__global T *src, int src_rows, int src_step, /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// /////////////////////////////////////////////////////////////// -void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_, +static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_, const __global T *dt, int u_step, int msg_disp_step, int data_disp_step, float4 cmax_disc_term, float4 cdisc_single_jump) diff --git a/modules/ocl/src/opencl/stereocsbp.cl b/modules/ocl/src/opencl/stereocsbp.cl index 50aabaca68..13a201cc1c 100644 --- a/modules/ocl/src/opencl/stereocsbp.cl +++ b/modules/ocl/src/opencl/stereocsbp.cl @@ -248,7 +248,7 @@ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, _ /////////////////////////////////////////////////////////////// /////////////////////// init data cost //////////////////////// /////////////////////////////////////////////////////////////// -float compute_3(__global uchar* left, __global uchar* right, +inline float compute_3(__global uchar* left, __global uchar* right, float cdata_weight, float cmax_data_term) { float tb = 0.114f * abs((int)left[0] - right[0]); @@ -257,17 +257,21 @@ float compute_3(__global uchar* left, __global uchar* right, return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); } -float compute_1(__global uchar* left, __global uchar* right, +inline float compute_1(__global uchar* left, __global uchar* right, float cdata_weight, float cmax_data_term) { return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term); } -short round_short(float v){ + +inline short round_short(float v) +{ return convert_short_sat_rte(v); } + /////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////init_data_cost/////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////// + __kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, int h, int w, int level, int channels, int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, @@ -993,7 +997,8 @@ __kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr /////////////////////////////////////////////////////////////// //////////////////////// init message ///////////////////////// /////////////////////////////////////////////////////////////// -void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, + +static void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, __global short *r_new, __global const short *u_cur, __global const short *d_cur, __global const short *l_cur, __global const short *r_cur, __global short *data_cost_selected, __global short *disparity_selected_new, @@ -1027,7 +1032,8 @@ void get_first_k_element_increase_0(__global short* u_new, __global short *d_new data_cost_new[id * cdisp_step1] = SHRT_MAX; } } -void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, + +static void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, __global float *r_new, __global const float *u_cur, __global const float *d_cur, __global const float *l_cur, __global const float *r_cur, __global float *data_cost_selected, __global float *disparity_selected_new, @@ -1190,7 +1196,8 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// /////////////////////////////////////////////////////////////// -void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1, + +static void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1, __global const short *msg2, __global const short *msg3, __global const short *dst_disp, __global const short *src_disp, int nr_plane, __global short *temp, @@ -1226,7 +1233,8 @@ void message_per_pixel_0(__global const short *data, __global short *msg_dst, __ for(int d = 0; d < nr_plane; d++) msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum); } -void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, + +static void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, __global const float *msg2, __global const float *msg3, __global const float *dst_disp, __global const float *src_disp, int nr_plane, __global float *temp, @@ -1262,6 +1270,7 @@ void message_per_pixel_1(__global const float *data, __global float *msg_dst, __ for(int d = 0; d < nr_plane; d++) msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum; } + __kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_, __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur, __global short *ctemp, int h, int w, int nr_plane, int i, @@ -1293,6 +1302,7 @@ __kernel void compute_message_0(__global short *u_, __global short *d_, __global cmax_disc_term, cdisp_step1, cdisc_single_jump); } } + __kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_, __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur, __global float *ctemp, int h, int w, int nr_plane, int i, @@ -1327,6 +1337,7 @@ __kernel void compute_message_1(__global float *u_, __global float *d_, __global /////////////////////////////////////////////////////////////// /////////////////////////// output //////////////////////////// /////////////////////////////////////////////////////////////// + __kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_, __global const short *r_, __global const short * data_cost_selected, __global const short *disp_selected_pyr, @@ -1364,6 +1375,7 @@ __kernel void compute_disp_0(__global const short *u_, __global const short *d_, disp[res_step * y + x] = best; } } + __kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_, __global const float *r_, __global const float *data_cost_selected, __global const float *disp_selected_pyr,