|
|
|
@ -46,145 +46,10 @@ |
|
|
|
|
|
|
|
|
|
//#pragma OPENCL EXTENSION cl_amd_printf : enable |
|
|
|
|
|
|
|
|
|
__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) |
|
|
|
|
{ |
|
|
|
|
const int x = get_global_id(0); |
|
|
|
|
const int y = get_global_id(1); |
|
|
|
|
|
|
|
|
|
if (y < rows && x < cols * cn) |
|
|
|
|
{ |
|
|
|
|
const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x]; |
|
|
|
|
const uchar src_val1 = (src + y * srcStep)[x]; |
|
|
|
|
const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x]; |
|
|
|
|
|
|
|
|
|
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; |
|
|
|
|
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) |
|
|
|
|
{ |
|
|
|
|
const int x = get_global_id(0); |
|
|
|
|
const int y = get_global_id(1); |
|
|
|
|
|
|
|
|
|
if (y < rows && x < cols * cn) |
|
|
|
|
{ |
|
|
|
|
const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x]; |
|
|
|
|
const uchar src_val1 = (src + y * srcStep)[x]; |
|
|
|
|
const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x]; |
|
|
|
|
|
|
|
|
|
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; |
|
|
|
|
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) |
|
|
|
|
{ |
|
|
|
|
const int x = get_global_id(0); |
|
|
|
|
const int y = get_global_id(1); |
|
|
|
|
|
|
|
|
|
const int colsn = cols * cn; |
|
|
|
|
|
|
|
|
|
if (y < rows && x < colsn) |
|
|
|
|
{ |
|
|
|
|
__global const short* dx_buf_row = dx_buf + y * dx_bufStep; |
|
|
|
|
__global const short* dy_buf_row = dy_buf + y * dy_bufStep; |
|
|
|
|
|
|
|
|
|
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; |
|
|
|
|
const int xl = x - cn >= 0 ? x - cn : cn + x; |
|
|
|
|
|
|
|
|
|
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; |
|
|
|
|
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) |
|
|
|
|
{ |
|
|
|
|
const int x = get_global_id(0); |
|
|
|
|
const int y = get_global_id(1); |
|
|
|
|
|
|
|
|
|
const int colsn = cols * cn; |
|
|
|
|
|
|
|
|
|
if (y < rows && x < colsn) |
|
|
|
|
{ |
|
|
|
|
__global const short* dx_buf_row = dx_buf + y * dx_bufStep; |
|
|
|
|
__global const short* dy_buf_row = dy_buf + y * dy_bufStep; |
|
|
|
|
|
|
|
|
|
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; |
|
|
|
|
const int xl = x - cn >= 0 ? x - cn : cn + x; |
|
|
|
|
|
|
|
|
|
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; |
|
|
|
|
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define W_BITS 14 |
|
|
|
|
#define W_BITS1 14 |
|
|
|
|
|
|
|
|
|
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) |
|
|
|
|
|
|
|
|
|
int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y) |
|
|
|
|
{ |
|
|
|
|
int2 ipt; |
|
|
|
|
ipt.x = convert_int_sat_rtn(pt.x); |
|
|
|
|
ipt.y = convert_int_sat_rtn(pt.y); |
|
|
|
|
|
|
|
|
|
float a = pt.x - ipt.x; |
|
|
|
|
float b = pt.y - ipt.y; |
|
|
|
|
|
|
|
|
|
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); |
|
|
|
|
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; |
|
|
|
|
|
|
|
|
|
__global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; |
|
|
|
|
__global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; |
|
|
|
|
|
|
|
|
|
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y) |
|
|
|
|
{ |
|
|
|
|
int2 ipt; |
|
|
|
|
ipt.x = convert_int_sat_rtn(pt.x); |
|
|
|
|
ipt.y = convert_int_sat_rtn(pt.y); |
|
|
|
|
|
|
|
|
|
float a = pt.x - ipt.x; |
|
|
|
|
float b = pt.y - ipt.y; |
|
|
|
|
|
|
|
|
|
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); |
|
|
|
|
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; |
|
|
|
|
|
|
|
|
|
__global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; |
|
|
|
|
__global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; |
|
|
|
|
|
|
|
|
|
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y) |
|
|
|
|
{ |
|
|
|
|
int2 ipt; |
|
|
|
|
ipt.x = convert_int_sat_rtn(pt.x); |
|
|
|
|
ipt.y = convert_int_sat_rtn(pt.y); |
|
|
|
|
|
|
|
|
|
float a = pt.x - ipt.x; |
|
|
|
|
float b = pt.y - ipt.y; |
|
|
|
|
|
|
|
|
|
float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
float iw01 = (a * (1.0f - b) * (1 << W_BITS)); |
|
|
|
|
float iw10 = ((1.0f - a) * b * (1 << W_BITS)); |
|
|
|
|
float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; |
|
|
|
|
|
|
|
|
|
__global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn; |
|
|
|
|
__global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn; |
|
|
|
|
|
|
|
|
|
return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define BUFFER 64 |
|
|
|
|
|
|
|
|
|
#ifndef WAVE_SIZE |
|
|
|
|
#define WAVE_SIZE 1 |
|
|
|
|
#endif |
|
|
|
|
#ifdef CPU |
|
|
|
|
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) |
|
|
|
|
{ |
|
|
|
@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local |
|
|
|
|
smem3[tid] = val3; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 128]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 128]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 64]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 64]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 32]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 32]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 32]; |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
smem2[tid] += smem2[tid + 32]; |
|
|
|
|
smem3[tid] += smem3[tid + 32]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 16]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 16]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 16]; |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
smem2[tid] += smem2[tid + 16]; |
|
|
|
|
smem3[tid] += smem3[tid + 16]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 8]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 8]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 8]; |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
smem2[tid] += smem2[tid + 8]; |
|
|
|
|
smem3[tid] += smem3[tid + 8]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 4]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 4]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 4]; |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
smem2[tid] += smem2[tid + 4]; |
|
|
|
|
smem3[tid] += smem3[tid + 4]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 2]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 2]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 2]; |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
smem2[tid] += smem2[tid + 2]; |
|
|
|
|
smem3[tid] += smem3[tid + 2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 1) |
|
|
|
|
{ |
|
|
|
|
smem1[BUFFER] = val1 += smem1[tid + 1]; |
|
|
|
|
smem2[BUFFER] = val2 += smem2[tid + 1]; |
|
|
|
|
smem3[BUFFER] = val3 += smem3[tid + 1]; |
|
|
|
|
smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; |
|
|
|
|
smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; |
|
|
|
|
smem3[BUFFER] = smem3[tid] + smem3[tid + 1]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l |
|
|
|
|
smem2[tid] = val2; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 128]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 128]); |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 64]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 64]); |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 32]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 32]); |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
smem2[tid] += smem2[tid + 32]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 16]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 16]); |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
smem2[tid] += smem2[tid + 16]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 8]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 8]); |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
smem2[tid] += smem2[tid + 8]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 4]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 4]); |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
smem2[tid] += smem2[tid + 4]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 2]); |
|
|
|
|
smem2[tid] = (val2 += smem2[tid + 2]); |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
smem2[tid] += smem2[tid + 2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 1) |
|
|
|
|
{ |
|
|
|
|
smem1[BUFFER] = (val1 += smem1[tid + 1]); |
|
|
|
|
smem2[BUFFER] = (val2 += smem2[tid + 1]); |
|
|
|
|
smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; |
|
|
|
|
smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid) |
|
|
|
|
smem1[tid] = val1; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 128]); |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 64]); |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 32]); |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 16]); |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 8]); |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 4]); |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = (val1 += smem1[tid + 2]); |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 1) |
|
|
|
|
{ |
|
|
|
|
smem1[BUFFER] = (val1 += smem1[tid + 1]); |
|
|
|
|
smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) |
|
|
|
|
void reduce3(float val1, float val2, float val3, |
|
|
|
|
__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1; |
|
|
|
|
smem2[tid] = val2; |
|
|
|
|
smem3[tid] = val3; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 128]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 128]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
smem2[tid] += smem2[tid + 32]; |
|
|
|
|
smem3[tid] += smem3[tid + 32]; |
|
|
|
|
#if WAVE_SIZE < 32 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 16) { |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 64]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 64]; |
|
|
|
|
smem3[tid] = val3 += smem3[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
smem2[tid] += smem2[tid + 16]; |
|
|
|
|
smem3[tid] += smem3[tid + 16]; |
|
|
|
|
#if WAVE_SIZE <16 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 8) { |
|
|
|
|
#endif |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
smem2[tid] += smem2[tid + 8]; |
|
|
|
|
smem3[tid] += smem3[tid + 8]; |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
volatile __local float* vmem1 = smem1; |
|
|
|
|
volatile __local float* vmem2 = smem2; |
|
|
|
|
volatile __local float* vmem3 = smem3; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 32]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 32]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 32]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 16]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 16]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 16]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 8]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 8]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 8]; |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
smem2[tid] += smem2[tid + 4]; |
|
|
|
|
smem3[tid] += smem3[tid + 4]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 4]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 4]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 4]; |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
smem2[tid] += smem2[tid + 2]; |
|
|
|
|
smem3[tid] += smem3[tid + 2]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 2]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 2]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 2]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 1]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 1]; |
|
|
|
|
vmem3[tid] = val3 += vmem3[tid + 1]; |
|
|
|
|
smem1[tid] += smem1[tid + 1]; |
|
|
|
|
smem2[tid] += smem2[tid + 1]; |
|
|
|
|
smem3[tid] += smem3[tid + 1]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) |
|
|
|
|
void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1; |
|
|
|
|
smem2[tid] = val2; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 128]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
smem2[tid] += smem2[tid + 32]; |
|
|
|
|
#if WAVE_SIZE < 32 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 16) { |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 64]; |
|
|
|
|
smem2[tid] = val2 += smem2[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
smem2[tid] += smem2[tid + 16]; |
|
|
|
|
#if WAVE_SIZE <16 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 8) { |
|
|
|
|
#endif |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
smem2[tid] += smem2[tid + 8]; |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
volatile __local float* vmem1 = smem1; |
|
|
|
|
volatile __local float* vmem2 = smem2; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 32]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 32]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 16]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 16]; |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
smem2[tid] += smem2[tid + 4]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 8]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 8]; |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
smem2[tid] += smem2[tid + 2]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 4]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 4]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 2]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 2]; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 1]; |
|
|
|
|
vmem2[tid] = val2 += vmem2[tid + 1]; |
|
|
|
|
smem1[tid] += smem1[tid + 1]; |
|
|
|
|
smem2[tid] += smem2[tid + 1]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void reduce1(float val1, __local float* smem1, int tid) |
|
|
|
|
void reduce1(float val1, __local volatile float* smem1, int tid) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
#if BUFFER > 128 |
|
|
|
|
if (tid < 128) |
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 32]; |
|
|
|
|
#if WAVE_SIZE < 32 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 16) { |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if BUFFER > 64 |
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem1[tid] = val1 += smem1[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
smem1[tid] += smem1[tid + 16]; |
|
|
|
|
#if WAVE_SIZE <16 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 8) { |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
volatile __local float* vmem1 = smem1; |
|
|
|
|
|
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 32]; |
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 16]; |
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 8]; |
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 4]; |
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 2]; |
|
|
|
|
vmem1[tid] = val1 += vmem1[tid + 1]; |
|
|
|
|
smem1[tid] += smem1[tid + 8]; |
|
|
|
|
smem1[tid] += smem1[tid + 4]; |
|
|
|
|
smem1[tid] += smem1[tid + 2]; |
|
|
|
|
smem1[tid] += smem1[tid + 1]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#define SCALE (1.0f / (1 << 20)) |
|
|
|
|
#define THRESHOLD 0.01f |
|
|
|
|
#define DIMENSION 21 |
|
|
|
|
|
|
|
|
|
// Image read mode |
|
|
|
|
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; |
|
|
|
|