optimize stereoBP kernel

pull/1084/head
yao 12 years ago
parent 6bf8f474fa
commit f2333e90f6
  1. 201
      modules/ocl/src/opencl/stereobp.cl
  2. 6
      modules/ocl/src/stereobp.cpp

@ -56,8 +56,10 @@
#ifdef T_FLOAT #ifdef T_FLOAT
#define T float #define T float
#define T4 float4
#else #else
#define T short #define T short
#define T4 short4
#endif #endif
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -71,6 +73,14 @@ T saturate_cast(float v){
#endif #endif
} }
T4 saturate_cast4(float4 v){
#ifdef T_SHORT
return convert_short4_sat_rte(v);
#else
return v;
#endif
}
#define FLOAT_MAX 3.402823466e+38f #define FLOAT_MAX 3.402823466e+38f
typedef struct typedef struct
{ {
@ -84,29 +94,14 @@ typedef struct
////////////////////////// comp data ////////////////////////// ////////////////////////// comp data //////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
float pix_diff_1(__global const uchar *ls, __global const uchar *rs) inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
{ {
return abs((int)(*ls) - *rs); return abs((int)(l.x) - *rs);
} }
float pix_diff_3(__global const uchar *ls, __global const uchar *rs) float pix_diff_4(const uchar4 l, __global const uchar *rs)
{ {
const float tr = 0.299f; uchar4 r;
const float tg = 0.587f;
const float tb = 0.114f;
float val;
val = tb * abs((int)ls[0] - rs[0]);
val += tg * abs((int)ls[1] - rs[1]);
val += tr * abs((int)ls[2] - rs[2]);
return val;
}
float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
{
uchar4 l, r;
l = *((__global uchar4 *)ls);
r = *((__global uchar4 *)rs); r = *((__global uchar4 *)rs);
const float tr = 0.299f; const float tr = 0.299f;
@ -122,11 +117,19 @@ float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
return val; return val;
} }
inline float pix_diff_3(const uchar4 l, __global const uchar *rs)
{
return pix_diff_4(l, rs);
}
#ifndef CN #ifndef CN
#define CN 4 #define CN 4
#endif #endif
#ifndef CNDISP
#define CNDISP 64
#endif
#define CAT(X,Y) X##Y #define CAT(X,Y) X##Y
#define CAT2(X,Y) CAT(X,Y) #define CAT2(X,Y) CAT(X,Y)
@ -149,19 +152,20 @@ __kernel void comp_data(__global uchar *left, int left_rows, int left_cols, i
__global T *ds = data + y * data_step + x; __global T *ds = data + y * data_step + x;
const unsigned int disp_step = data_step * left_rows; const unsigned int disp_step = data_step * left_rows;
const float weightXterm = con_st -> cdata_weight * con_st -> cmax_data_term;
const uchar4 ls_data = vload4(0, ls);
for (int disp = 0; disp < con_st -> cndisp; disp++) for (int disp = 0; disp < con_st -> cndisp; disp++)
{ {
if (x - disp >= 1) if (x - disp >= 1)
{ {
float val = 0; float val = 0;
val = PIX_DIFF(ls, rs - disp * CN); val = PIX_DIFF(ls_data, rs - disp * CN);
ds[disp * disp_step] = saturate_cast(fmin(con_st -> cdata_weight * val, ds[disp * disp_step] = saturate_cast(fmin(con_st -> cdata_weight * val, weightXterm));
con_st -> cdata_weight * con_st -> cmax_data_term));
} }
else else
{ {
ds[disp * disp_step] = saturate_cast(con_st -> cdata_weight * con_st -> cmax_data_term); ds[disp * disp_step] = saturate_cast(weightXterm);
} }
} }
} }
@ -182,13 +186,20 @@ __kernel void data_step_down(__global T *src, int src_rows,
{ {
src_step /= sizeof(T); src_step /= sizeof(T);
dst_step /= sizeof(T); dst_step /= sizeof(T);
int4 coor_step = (int4)(src_rows * src_step);
int4 coor = (int4)(min(2*y+0, src_rows-1) * src_step + 2*x+0,
min(2*y+1, src_rows-1) * src_step + 2*x+0,
min(2*y+0, src_rows-1) * src_step + 2*x+1,
min(2*y+1, src_rows-1) * src_step + 2*x+1);
for (int d = 0; d < cndisp; ++d) for (int d = 0; d < cndisp; ++d)
{ {
float dst_reg; float dst_reg;
dst_reg = src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+0]; dst_reg = src[coor.x];
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+0]; dst_reg += src[coor.y];
dst_reg += src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+1]; dst_reg += src[coor.z];
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+1]; dst_reg += src[coor.w];
coor += coor_step;
dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg); dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
} }
@ -224,85 +235,95 @@ __kernel void level_up_message(__global T *src, int src_rows, int src_step,
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void calc_min_linear_penalty(__global T * dst, int disp_step, void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
int cndisp, float cdisc_single_jump) const __global T *dt,
int u_step, int msg_disp_step, int data_disp_step,
float4 cmax_disc_term, float4 cdisc_single_jump)
{ {
float prev = dst[0]; __global T *us = us_ + u_step;
float cur; __global T *ds = ds_ - u_step;
__global T *ls = ls_ + 1;
__global T *rs = rs_ - 1;
for (int disp = 1; disp < cndisp; ++disp) float4 minimum = (float4)(FLOAT_MAX);
{
prev += cdisc_single_jump;
cur = dst[disp_step * disp];
if (prev < cur) T4 t_dst[CNDISP];
{ float4 dst_reg;
cur = prev; float4 prev;
dst[disp_step * disp] = saturate_cast(prev); float4 cur;
}
prev = cur; T t_us = us[0];
} T t_ds = ds[0];
T t_ls = ls[0];
T t_rs = rs[0];
T t_dt = dt[0];
prev = (float4)(t_us + t_ls + t_rs + t_dt,
t_ds + t_ls + t_rs + t_dt,
t_us + t_ds + t_rs + t_dt,
t_us + t_ds + t_ls + t_dt);
minimum = min(prev, minimum);
t_dst[0] = saturate_cast4(prev);
prev = dst[(cndisp - 1) * disp_step]; for(int i = 1, idx = msg_disp_step; i < CNDISP; ++i, idx+=msg_disp_step)
for (int disp = cndisp - 2; disp >= 0; disp--)
{ {
t_us = us[idx];
t_ds = ds[idx];
t_ls = ls[idx];
t_rs = rs[idx];
t_dt = dt[data_disp_step * i];
dst_reg = (float4)(t_us + t_ls + t_rs + t_dt,
t_ds + t_ls + t_rs + t_dt,
t_us + t_ds + t_rs + t_dt,
t_us + t_ds + t_ls + t_dt);
minimum = min(dst_reg, minimum);
prev += cdisc_single_jump; prev += cdisc_single_jump;
cur = dst[disp_step * disp]; prev = min(prev, dst_reg);
if (prev < cur) t_dst[i] = saturate_cast4(prev);
{
cur = prev;
dst[disp_step * disp] = saturate_cast(prev);
}
prev = cur;
} }
}
void message(const __global T *msg1, const __global T *msg2,
const __global T *msg3, const __global T *data, __global T *dst,
int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
float minimum = FLOAT_MAX;
for(int i = 0; i < cndisp; ++i) minimum += cmax_disc_term;
float4 sum = 0;
prev = convert_float4(t_dst[CNDISP - 1]);
for (int disp = CNDISP - 2; disp >= 0; disp--)
{ {
float dst_reg; prev += cdisc_single_jump;
dst_reg = msg1[msg_disp_step * i]; cur = convert_float4(t_dst[disp]);
dst_reg += msg2[msg_disp_step * i]; prev = min(prev, cur);
dst_reg += msg3[msg_disp_step * i]; cur = min(prev, minimum);
dst_reg += data[data_disp_step * i]; sum += cur;
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = saturate_cast(dst_reg); t_dst[disp] = saturate_cast4(cur);
} }
calc_min_linear_penalty(dst, msg_disp_step, cndisp, cdisc_single_jump); dst_reg = convert_float4(t_dst[CNDISP - 1]);
dst_reg = min(dst_reg, minimum);
t_dst[CNDISP - 1] = saturate_cast4(dst_reg);
sum += dst_reg;
minimum += cmax_disc_term; sum /= CNDISP;
#pragma unroll
float sum = 0; for(int i = 0, idx = 0; i < CNDISP; ++i, idx+=msg_disp_step)
for(int i = 0; i < cndisp; ++i)
{ {
float dst_reg = dst[msg_disp_step * i]; T4 dst = t_dst[i];
if (dst_reg > minimum) us_[idx] = dst.x - sum.x;
{ ds_[idx] = dst.y - sum.y;
dst_reg = minimum; rs_[idx] = dst.z - sum.z;
dst[msg_disp_step * i] = saturate_cast(minimum); ls_[idx] = dst.w - sum.w;
}
sum += dst_reg;
} }
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
} }
__kernel void one_iteration(__global T *u, int u_step, __kernel void one_iteration(__global T *u, int u_step,
__global T *data, int data_step, __global T *data, int data_step,
__global T *d, __global T *l, __global T *r, __global T *d, __global T *l, __global T *r,
int t, int cols, int rows, int t, int cols, int rows,
int cndisp, float cmax_disc_term, float cdisc_single_jump) float cmax_disc_term, float cdisc_single_jump)
{ {
const int y = get_global_id(1); const int y = get_global_id(1);
const int x = ((get_global_id(0)) << 1) + ((y + t) & 1); const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
@ -321,15 +342,9 @@ __kernel void one_iteration(__global T *u, int u_step,
int msg_disp_step = u_step * rows; int msg_disp_step = u_step * rows;
int data_disp_step = data_step * rows; int data_disp_step = data_step * rows;
message(us + u_step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step, cndisp, message(us, ds, ls, rs, dt,
cmax_disc_term, cdisc_single_jump); u_step, msg_disp_step, data_disp_step,
message(ds - u_step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step, cndisp, (float4)(cmax_disc_term), (float4)(cdisc_single_jump));
cmax_disc_term, cdisc_single_jump);
message(us + u_step, ds - u_step, rs - 1, dt, rs, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message(us + u_step, ds - u_step, ls + 1, dt, ls, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
} }
} }

@ -236,13 +236,13 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int) , (void *)&t)); args.push_back( make_pair( sizeof(cl_int) , (void *)&t));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cols)); args.push_back( make_pair( sizeof(cl_int) , (void *)&cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cndisp));
args.push_back( make_pair( sizeof(cl_float) , (void *)&cmax_disc_term)); args.push_back( make_pair( sizeof(cl_float) , (void *)&cmax_disc_term));
args.push_back( make_pair( sizeof(cl_float) , (void *)&cdisc_single_jump)); args.push_back( make_pair( sizeof(cl_float) , (void *)&cdisc_single_jump));
size_t gt[3] = {cols, rows, 1}, lt[3] = {16, 16, 1}; size_t gt[3] = {cols, rows, 1}, lt[3] = {16, 16, 1};
const char* t_opt = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT"; char opt[80] = "";
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt); sprintf(opt, "-D %s -D CNDISP=%d", data_type == CV_16S ? "T_SHORT":"T_FLOAT", cndisp);
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, opt);
} }
static void calc_all_iterations_calls(int cols, int rows, int iters, oclMat &u, static void calc_all_iterations_calls(int cols, int rows, int iters, oclMat &u,

Loading…
Cancel
Save