|
|
|
@ -52,6 +52,18 @@ |
|
|
|
|
|
|
|
|
|
#define noconvert |
|
|
|
|
|
|
|
|
|
#if cn != 3 |
|
|
|
|
#define loadpix(addr) *(__global const srcT *)(addr) |
|
|
|
|
#define storepix(val, addr) *(__global dstT *)(addr) = val |
|
|
|
|
#define srcTSIZE (int)sizeof(srcT) |
|
|
|
|
#define dstTSIZE (int)sizeof(dstT) |
|
|
|
|
#else |
|
|
|
|
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) |
|
|
|
|
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) |
|
|
|
|
#define srcTSIZE ((int)sizeof(srcT1)*3) |
|
|
|
|
#define dstTSIZE ((int)sizeof(dstT1)*3) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#ifdef HAVE_MASK |
|
|
|
|
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset |
|
|
|
|
#else |
|
|
|
@ -88,19 +100,20 @@ |
|
|
|
|
|
|
|
|
|
#ifdef HAVE_MASK |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
dstT temp = convertToDT(src[0]); \ |
|
|
|
|
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ |
|
|
|
|
if (mask[mask_index]) \ |
|
|
|
|
FUNC(accumulator, temp) |
|
|
|
|
{ \ |
|
|
|
|
dstT temp = convertToDT(loadpix(srcptr + src_index)); \ |
|
|
|
|
FUNC(accumulator, temp); \ |
|
|
|
|
} |
|
|
|
|
#elif defined OP_DOT |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \ |
|
|
|
|
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \ |
|
|
|
|
dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \ |
|
|
|
|
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \ |
|
|
|
|
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ |
|
|
|
|
FUNC(accumulator, temp, temp2) |
|
|
|
|
#else |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
dstT temp = convertToDT(src[0]); \ |
|
|
|
|
dstT temp = convertToDT(loadpix(srcptr + src_index)); \ |
|
|
|
|
FUNC(accumulator, temp) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
@ -111,8 +124,7 @@ |
|
|
|
|
#define REDUCE_LOCAL_2 \ |
|
|
|
|
localmem[lid] += localmem[lid2] |
|
|
|
|
#define CALC_RESULT \ |
|
|
|
|
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ |
|
|
|
|
dst[0] = localmem[0] |
|
|
|
|
storepix(localmem[0], dstptr + dstTSIZE * gid) |
|
|
|
|
|
|
|
|
|
// countNonZero stuff |
|
|
|
|
#elif defined OP_COUNT_NON_ZERO |
|
|
|
@ -123,7 +135,7 @@ |
|
|
|
|
dstT accumulator = (dstT)(0); \ |
|
|
|
|
srcT zero = (srcT)(0), one = (srcT)(1) |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
accumulator += src[0] == zero ? zero : one |
|
|
|
|
accumulator += loadpix(srcptr + src_index) == zero ? zero : one |
|
|
|
|
#define SET_LOCAL_1 \ |
|
|
|
|
localmem[lid] = accumulator |
|
|
|
|
#define REDUCE_LOCAL_1 \ |
|
|
|
@ -131,8 +143,7 @@ |
|
|
|
|
#define REDUCE_LOCAL_2 \ |
|
|
|
|
localmem[lid] += localmem[lid2] |
|
|
|
|
#define CALC_RESULT \ |
|
|
|
|
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ |
|
|
|
|
dst[0] = localmem[0] |
|
|
|
|
storepix(localmem[0], dstptr + dstTSIZE * gid) |
|
|
|
|
|
|
|
|
|
// minMaxLoc stuff |
|
|
|
|
#elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK |
|
|
|
@ -167,6 +178,8 @@ |
|
|
|
|
#define MAX_VAL DBL_MAX |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#define dstT srcT |
|
|
|
|
|
|
|
|
|
#define DECLARE_LOCAL_MEM \ |
|
|
|
|
__local srcT localmem_min[WGS2_ALIGNED]; \ |
|
|
|
|
__local srcT localmem_max[WGS2_ALIGNED]; \ |
|
|
|
@ -181,7 +194,7 @@ |
|
|
|
|
srcT temp; \ |
|
|
|
|
int temploc |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
temp = src[0]; \ |
|
|
|
|
temp = loadpix(srcptr + src_index); \ |
|
|
|
|
temploc = id; \ |
|
|
|
|
srcT temp_minval = minval, temp_maxval = maxval; \ |
|
|
|
|
minval = min(minval, temp); \ |
|
|
|
@ -217,10 +230,8 @@ |
|
|
|
|
localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \ |
|
|
|
|
localmem_maxloc[lid2] : localmem_maxloc[lid] |
|
|
|
|
#define CALC_RESULT \ |
|
|
|
|
__global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \ |
|
|
|
|
__global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \ |
|
|
|
|
dstminval[0] = localmem_min[0]; \ |
|
|
|
|
dstmaxval[0] = localmem_max[0]; \ |
|
|
|
|
storepix(localmem_min[0], dstptr + dstTSIZE * gid); \ |
|
|
|
|
storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \ |
|
|
|
|
dstlocptr[gid] = localmem_minloc[0]; \ |
|
|
|
|
dstlocptr2[gid] = localmem_maxloc[0] |
|
|
|
|
|
|
|
|
@ -236,7 +247,7 @@ |
|
|
|
|
int temploc |
|
|
|
|
#undef REDUCE_GLOBAL |
|
|
|
|
#define REDUCE_GLOBAL \ |
|
|
|
|
temp = src[0]; \ |
|
|
|
|
temp = loadpix(srcptr + src_index); \ |
|
|
|
|
temploc = id; \ |
|
|
|
|
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \ |
|
|
|
|
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ |
|
|
|
@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset |
|
|
|
|
|
|
|
|
|
for (int grain = groupnum * WGS; id < total; id += grain) |
|
|
|
|
{ |
|
|
|
|
int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset)); |
|
|
|
|
__global const srcT * src = (__global const srcT *)(srcptr + src_index); |
|
|
|
|
int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset)); |
|
|
|
|
REDUCE_GLOBAL; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|