From af33c118b41136bbecaa4df3cc83e1c0f529ca3d Mon Sep 17 00:00:00 2001 From: perping Date: Fri, 1 Nov 2013 14:07:10 +0800 Subject: [PATCH 1/2] fixed a bug of haar. --- modules/ocl/src/haar.cpp | 6 +-- modules/ocl/src/opencl/haarobjectdetect.cl | 50 +++++++++++++++---- .../src/opencl/haarobjectdetect_scaled2.cl | 31 +++++++++--- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 1ef0e95482..95b934750a 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -1676,9 +1676,9 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs( { sz = sizev[i]; factor = scalev[i]; - int ystep = cvRound(std::max(2., factor)); - int width = (cols - 1 - sz.width + ystep - 1) / ystep; - int height = (rows - 1 - sz.height + ystep - 1) / ystep; + double ystep = cv::max(2.,factor); + int width = cvRound((cols - 1 - sz.width + ystep - 1) / ystep); + int height = cvRound((rows - 1 - sz.height + ystep - 1) / ystep); int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 1d53f2b880..e74256f527 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -11,6 +11,7 @@ // Jia Haipeng, jiahaipeng95@gmail.com // Nathan, liujun@multicorewareinc.com // Peng Xiao, pengxiao@outlook.com +// Erping Pang, erping@multicorewareinc.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -321,7 +322,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int glb_x = grpoffx + (lcl_x<<2); int glb_y = grpoffy + lcl_y; - int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x); + int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x); int4 data = *(__global int4*)&sum[glb_off]; int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); @@ -421,12 +422,25 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa result = (stage_sum >= stagethreshold); } - - if(result && (x < width) && (y < height)) + if(factor < 2) { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + if(result && lclidx %2 ==0 && lclidy %2 ==0 ) + { + + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); + } + } + else + { + if(result) + { + + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); + } } barrier(CLK_LOCAL_MEM_FENCE); int queuecount = lclcount[0]; @@ -549,11 +563,27 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16)); temp = glboutindex[0]; int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor*20.f); - candidate_result.x = convert_int_rtn(x*factor); - candidate_result.y = convert_int_rtn(y*factor); + candidate_result.zw = (int2)convert_int_rtn(round(factor*20.f)); + candidate_result.x = convert_int_rtn(round(x*factor)); + candidate_result.y = convert_int_rtn(round(y*factor)); atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id] = candidate_result; + + int i = outputoff+temp+lcl_id; + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + } + else + { + for(i=i+1;;i++) + { + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + break; + } + } + } } barrier(CLK_LOCAL_MEM_FENCE); }//end for(int grploop=grpidx;grploop> 16; temp = atomic_inc(glboutindex); int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.zw = (int2)convert_int_rtn(round(factor * 20.f)); candidate_result.x = x; candidate_result.y = y; - candidate[outputoff + temp + lcl_id] = candidate_result; + + int i = outputoff+temp+lcl_id; + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + } + else + { + for(i=i+1;;i++) + { + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + break; + } + } + } } barrier(CLK_LOCAL_MEM_FENCE); @@ -284,7 +301,7 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); } - t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]); counter += nodenum; #pragma unroll From af77111cd6b0a30cc43e6502f0e8ccb5af689cef Mon Sep 17 00:00:00 2001 From: perping Date: Fri, 1 Nov 2013 17:53:35 +0800 Subject: [PATCH 2/2] remove whitespace. --- modules/ocl/src/haar.cpp | 10 +++++----- modules/ocl/src/opencl/haarobjectdetect.cl | 14 ++++++-------- modules/ocl/src/opencl/haarobjectdetect_scaled2.cl | 8 ++++---- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 95b934750a..31f6742811 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -1059,11 +1059,11 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS { sz = sizev[i]; factor = scalev[i]; - int ystep = cvRound(std::max(2., factor)); - int equRect_x = (int)(factor * gcascade->p0 + 0.5); - int equRect_y = (int)(factor * gcascade->p1 + 0.5); - int equRect_w = (int)(factor * gcascade->p3 + 0.5); - int equRect_h = (int)(factor * gcascade->p2 + 0.5); + double ystep = std::max(2., factor); + int equRect_x = cvRound(factor * gcascade->p0); + int equRect_y = cvRound(factor * gcascade->p1); + int equRect_w = cvRound(factor * gcascade->p3); + int equRect_h = cvRound(factor * gcascade->p2); p[i].s[0] = equRect_x; p[i].s[1] = equRect_y; p[i].s[2] = equRect_x + equRect_w; diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index e74256f527..bafd474725 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -426,7 +426,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa { if(result && lclidx %2 ==0 && lclidy %2 ==0 ) { - int queueindex = atomic_inc(lclcount); lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); @@ -436,7 +435,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa { if(result) { - int queueindex = atomic_inc(lclcount); lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); @@ -563,20 +561,20 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16)); temp = glboutindex[0]; int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(round(factor*20.f)); - candidate_result.x = convert_int_rtn(round(x*factor)); - candidate_result.y = convert_int_rtn(round(y*factor)); + candidate_result.zw = (int2)convert_int_rte(factor*20.f); + candidate_result.x = convert_int_rte(x*factor); + candidate_result.y = convert_int_rte(y*factor); atomic_inc(glboutindex); int i = outputoff+temp+lcl_id; if(candidate[i].z == 0) - { + { candidate[i] = candidate_result; } else - { + { for(i=i+1;;i++) - { + { if(candidate[i].z == 0) { candidate[i] = candidate_result; diff --git a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl index 9597dfe00f..a8faaf8421 100644 --- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl +++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl @@ -259,19 +259,19 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( int y = (temp & (int)0xffff0000) >> 16; temp = atomic_inc(glboutindex); int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(round(factor * 20.f)); + candidate_result.zw = (int2)convert_int_rte(factor * 20.f); candidate_result.x = x; candidate_result.y = y; int i = outputoff+temp+lcl_id; if(candidate[i].z == 0) - { + { candidate[i] = candidate_result; } else - { + { for(i=i+1;;i++) - { + { if(candidate[i].z == 0) { candidate[i] = candidate_result;