diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index bf911f4bea..c891eca452 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -111,6 +111,7 @@ namespace cv
             bool haveDoubleSupport;
             bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
+            bool isIntelDevice;
 
             std::string compilationExtraOptions;
 
@@ -157,7 +158,8 @@ namespace cv
         {
             FEATURE_CL_DOUBLE = 1,
             FEATURE_CL_UNIFIED_MEM,
-            FEATURE_CL_VER_1_2
+            FEATURE_CL_VER_1_2,
+            FEATURE_CL_INTEL_DEVICE
         };
 
         // Represents OpenCL context, interface
diff --git a/modules/ocl/src/cl_context.cpp b/modules/ocl/src/cl_context.cpp
index 258ed91e51..fab67c5a65 100644
--- a/modules/ocl/src/cl_context.cpp
+++ b/modules/ocl/src/cl_context.cpp
@@ -448,6 +448,17 @@ static int initializeOpenCLDevices()
                 {
                     deviceInfo.info.haveDoubleSupport = false;
                 }
+
+                size_t intel_platform = platformInfo.info.platformVendor.find("Intel");
+                if(intel_platform != std::string::npos)
+                {
+                    deviceInfo.info.compilationExtraOptions += " -D INTEL_DEVICE";
+                    deviceInfo.info.isIntelDevice = true;
+                }
+                else
+                {
+                    deviceInfo.info.isIntelDevice = false;
+                }
             }
         }
     }
@@ -471,7 +482,7 @@ DeviceInfo::DeviceInfo()
     deviceVendorId(-1), maxWorkGroupSize(0), maxComputeUnits(0),
     localMemorySize(0), maxMemAllocSize(0),
     deviceVersionMajor(0), deviceVersionMinor(0),
-    haveDoubleSupport(false), isUnifiedMemory(false),
+    haveDoubleSupport(false), isUnifiedMemory(false), isIntelDevice(false),
     platform(NULL)
 {
     // nothing
@@ -572,6 +583,8 @@ bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
 {
     switch (featureType)
     {
+    case FEATURE_CL_INTEL_DEVICE:
+        return deviceInfo.isIntelDevice;
     case FEATURE_CL_DOUBLE:
         return deviceInfo.haveDoubleSupport;
     case FEATURE_CL_UNIFIED_MEM:
diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp
index 40c1f2ab39..9f71af46dc 100644
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -849,16 +849,138 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
         args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
 
-        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
"-D STUMP_BASED=1" : "-D STUMP_BASED=0"; + if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + //setup local group size + localThreads[0] = 8; + localThreads[1] = 16; + localThreads[2] = 1; + + //init maximal number of workgroups + int WGNumX = 1+(sizev[0].width /(localThreads[0])); + int WGNumY = 1+(sizev[0].height/(localThreads[1])); + int WGNumZ = loopcount; + int WGNum = 0; //accurate number of non -empty workgroups + oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); + { + cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE_INVALIDATE_REGION, 0, oclWGInfo.step, 0,0,0,&status); + openCLVerifyCall(status); + for(int z=0;z> 16)&0xFFFF; + int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF; + for(int y=0;y=(Height-cascade->orig_window_size.height)) + continue; // no data to process + for(int x=0;x=(Width-cascade->orig_window_size.width)) + continue; // no data to process + + // save no-empty workgroup info into array + pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNum].s[1] = (gx << 16) | gy; + pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; + pWGInfo[WGNum].s[3] = *(int*)&scaleinfo[z].factor; + WGNum++; + } + } + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0)); + pWGInfo = NULL; + } - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); + // setup global sizes to have linear array of workgroups with WGNum size + globalThreads[0] = localThreads[0]*WGNum; + globalThreads[1] = localThreads[1]; + globalThreads[2] = 1; + +#define NODE_SIZE 12 + // pack node info to have less memory loads + oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U); + { + cl_int status; + cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE_INVALIDATE_REGION, 0, oclNodesPK.step, 0,0,0,&status); + openCLVerifyCall(status); + //use known local data stride to precalulate indexes + int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width); + // check that maximal value is less than maximal unsigned short + assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX); + for(int i = 0;islm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]); + pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]); + } + //store used float point values for each node + pOut->weight[0] = node[i].weight[0]; + pOut->weight[1] = node[i].weight[1]; + pOut->weight[2] = node[i].weight[2]; + pOut->threshold = node[i].threshold; + pOut->alpha[0] = node[i].alpha[0]; + pOut->alpha[1] = node[i].alpha[1]; + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0)); + pNodesPK = NULL; + } + // add 2 additional buffers (WGinfo and packed nodes) as 2 last args + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart )); + + //form build options for kernel + string options = "-D PACKED_CLASSIFIER"; + options += format(" -D NODE_SIZE=%d",NODE_SIZE); + options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); 
+ options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); + options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); + options += format(" -D LSx=%d",localThreads[0]); + options += format(" -D LSy=%d",localThreads[1]); + options += format(" -D SPLITNODE=%d",splitnode); + options += format(" -D SPLITSTAGE=%d",splitstage); + options += format(" -D OUTPUTSZ=%d",outputsz); + + // init candiate global count by 0 + int pattern = 0; + openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL)); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str()); + //read candidate buffer back and put it into host list + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + assert(candidate[0]is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0"; - openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); - for(int i = 0; i < outputsz; i++) - if(candidate[4 * i + 2] != 0) - allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], - candidate[4 * i + 2], candidate[4 * i + 3])); + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + + for(int i = 0; i < outputsz; i++) + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + } free(scaleinfo); free(candidate); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 10b6804869..e1346405cb 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -905,8 +905,56 @@ namespace cv if (ksize > 0) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + Context* clCxt = Context::getContext(); + if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && + src.cols % 8 == 0 && src.rows % 8 == 0 && + ksize==3) + { + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + const unsigned int block_x = 8; + const unsigned int block_y = 8; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = Dx.cols; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); + size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; + + string option = "-D BLK_X=8 -D BLK_Y=8"; + switch(borderType) + { + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + } + openCLExecuteKernel(src.clCxt, 
+                }
+                else
+                {
+                    Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
+                    Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+                }
             }
             else
             {
diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl
index 22a7fe7cbf..dc7ebaadb0 100644
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -101,6 +101,144 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
     float inv_window_area __attribute__((aligned (4)));
 } GpuHidHaarClassifierCascade;
 
+
+#ifdef PACKED_CLASSIFIER
+// this code is scalar, one pixel -> one workitem
+__kernel void gpuRunHaarClassifierCascadePacked(
+    global const GpuHidHaarStageClassifier * stagecascadeptr,
+    global const int4 * info,
+    global const GpuHidHaarTreeNode * nodeptr,
+    global const int * restrict sum,
+    global const float * restrict sqsum,
+    volatile global int4 * candidate,
+    const int pixelstep,
+    const int loopcount,
+    const int start_stage,
+    const int split_stage,
+    const int end_stage,
+    const int startnode,
+    const int splitnode,
+    const int4 p,
+    const int4 pq,
+    const float correction,
+    global const int* pNodesPK,
+    global const int4* pWGInfo
+    )
+
+{
+// this version uses the workgroup information precomputed on the host,
+// so there are no empty workgroups
+    int gid = (int)get_group_id(0);
+    int lid_x = (int)get_local_id(0);
+    int lid_y = (int)get_local_id(1);
+    int lid = lid_y*LSx+lid_x;
+    int4 WGInfo = pWGInfo[gid];
+    int GroupX = (WGInfo.y >> 16)&0xFFFF;
+    int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
+    int Width  = (WGInfo.x >> 16)&0xFFFF;
+    int Height = (WGInfo.x >> 0 )& 0xFFFF;
+    int ImgOffset = WGInfo.z;
+    float ScaleFactor = as_float(WGInfo.w);
+
+#define DATA_SIZE_X (LSx+WND_SIZE_X)
+#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
+#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
+
+    local int SumL[DATA_SIZE];
+
+    // read input data window into local mem
+    for(int i = 0; i<DATA_SIZE; i+=(LSx*LSy))
+    {
+        int index = i+lid; // index in shared local memory
+        if(index<DATA_SIZE)
+        {// calc global x,y coordinates and read the integral sum from there
+            int x = min(GroupX + (index % (DATA_SIZE_X)), Width-1);
+            int y = min(GroupY + (index / (DATA_SIZE_X)), Height-1);
+            SumL[index] = sum[ImgOffset + y*pixelstep + x];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // calc variance_norm_factor for the window tested by this workitem
+    float variance_norm_factor;
+    int nodecounter = startnode;
+    {
+        int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
+        int glb_off = ImgOffset + (GroupY+lid_y)*pixelstep + (GroupX+lid_x);
+        float windowsum =
+            SumL[lcl_off + p.y*DATA_SIZE_X + p.x] - SumL[lcl_off + p.y*DATA_SIZE_X + p.z]
+          - SumL[lcl_off + p.w*DATA_SIZE_X + p.x] + SumL[lcl_off + p.w*DATA_SIZE_X + p.z];
+        float sqsumvalue =
+            sqsum[glb_off + pq.y*pixelstep + pq.x] - sqsum[glb_off + pq.y*pixelstep + pq.z]
+          - sqsum[glb_off + pq.w*pixelstep + pq.x] + sqsum[glb_off + pq.w*pixelstep + pq.z];
+        float mean = windowsum * correction;
+        variance_norm_factor = sqsumvalue * correction - mean * mean;
+        variance_norm_factor = (variance_norm_factor>=0.f) ? sqrt(variance_norm_factor) : 1.f;
+    }// end calc variance_norm_factor for all stages
+
+    int result = (1.0f>0.0f);
+    for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
+    {// iterate stages while the window is still a candidate
+        float stage_sum = 0.0f;
+        int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
+        float stagethreshold = as_float(stageinfo.y);
+        int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
+        for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ )
+        {
+            // simple macros to extract shorts from an int
+#define M0(_t) ((_t)&0xFFFF)
+#define M1(_t) (((_t)>>16)&0xFFFF)
+            // load packed node data from global memory (L3) into registers
+            global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
+            int4 n0 = pN[0];
+            int4 n1 = pN[1];
+            int4 n2 = pN[2];
+            float nodethreshold = as_float(n2.y) * variance_norm_factor;
+            // calc sum of intensity pixels according to node information
+            float classsum =
+                (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
+                (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
+                (SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x);
+            //accumulate stage response
+            stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
+        }
+        result = (stage_sum >= stagethreshold);
+    }// next stage if needed
+
+    if(result)
+    {// all stages passed: a face is detected at the tested position
+        int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
+        if(index<OUTPUTSZ)
+        {// write the detection into the output buffer
+            candidate[index].x = convert_int_rtn(ScaleFactor*(GroupX+lid_x));
+            candidate[index].y = convert_int_rtn(ScaleFactor*(GroupY+lid_y));
+            candidate[index].z = convert_int_rtn(ScaleFactor*WND_SIZE_X);
+            candidate[index].w = convert_int_rtn(ScaleFactor*WND_SIZE_Y);
+        }
+    }
+}//end gpuRunHaarClassifierCascadePacked
+#endif
diff --git a/modules/ocl/src/opencl/imgproc_sobel2.cl b/modules/ocl/src/opencl/imgproc_sobel2.cl
new file mode 100644
--- /dev/null
+++ b/modules/ocl/src/opencl/imgproc_sobel2.cl
@@ -0,0 +1,104 @@
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT101
+//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_WRAP
+//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+
+__kernel void sobel3(
+    __global uchar* Src,
+    __global float* DstX,
+    __global float* DstY,
+    int width, int height,
+    uint srcStride, uint dstStride,
+    float scale
+    )
+{
+    __local float lsmem[BLK_Y+2][BLK_X+2];
+
+    int lix = get_local_id(0);
+    int liy = get_local_id(1);
+
+    int gix = get_group_id(0);
+    int giy = get_group_id(1);
+
+    int id_x = get_global_id(0);
+    int id_y = get_global_id(1);
+
+    lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
+
+    int id_y_h = ADDR_H(id_y-1, 0, height);
+    int id_y_b = ADDR_B(id_y+1, height, id_y+1);
+
+    int id_x_l = ADDR_L(id_x-1, 0, width);
+    int id_x_r = ADDR_R(id_x+1, width, id_x+1);
+
+    if(liy==0)
+    {
+        lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
+
+        if(lix==0)
+            lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
+        else if(lix==BLK_X-1)
+            lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
+    }
+    else if(liy==BLK_Y-1)
+    {
+        lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
+
+        if(lix==0)
+            lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
+        else if(lix==BLK_X-1)
+            lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
+    }
+
+    if(lix==0)
+        lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]);
+    else if(lix==BLK_X-1)
+        lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float u1 = lsmem[liy][lix];
+    float u2 = lsmem[liy][lix+1];
+    float u3 = lsmem[liy][lix+2];
+
+    float m1 = lsmem[liy+1][lix];
+    float m2 = lsmem[liy+1][lix+1];
+    float m3 = lsmem[liy+1][lix+2];
+
+    float b1 = lsmem[liy+2][lix];
+    float b2 = lsmem[liy+2][lix+1];
+    float b3 = lsmem[liy+2][lix+2];
+
+    float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
+    DstX[ id_y * dstStride + id_x ] = dx * scale;
+
+    float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
+    DstY[ id_y * dstStride + id_x ] = dy * scale;
+}
\ No newline at end of file
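
Usage note (illustrative, not part of the patch): the new FEATURE_CL_INTEL_DEVICE flag is queried through the existing Context::supportsFeature() API, exactly as haar.cpp and imgproc.cpp do above. A minimal sketch of a caller, assuming the OpenCV 2.4 ocl headers; the helper function name is hypothetical:

    #include "opencv2/ocl/ocl.hpp"

    // Hypothetical helper: true only when the selected OpenCL platform vendor
    // string contains "Intel"; kernels built through this context also receive
    // the extra -D INTEL_DEVICE compile option set in cl_context.cpp.
    static bool useIntelOptimizedPath()
    {
        cv::ocl::Context *ctx = cv::ocl::Context::getContext();
        return ctx->supportsFeature(cv::ocl::FEATURE_CL_INTEL_DEVICE);
    }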