diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 62e69a8a24..081d2343dc 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -127,8 +127,9 @@ namespace cv // currently only support wavefront size queries enum DEVICE_INFO { - WAVEFRONT_SIZE, //in AMD speak - WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak + WAVEFRONT_SIZE, //in AMD speak + WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak + IS_CPU_DEVICE //check if the device is CPU }; //info should have been pre-allocated void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info); diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index b23f00c90d..7a13324077 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -44,7 +44,6 @@ //M*/ #include "precomp.hpp" - using namespace cv; using namespace cv::ocl; using namespace std; @@ -230,7 +229,6 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc } } - void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img) { computeGradient(img, grad, qangle); @@ -1571,6 +1569,27 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b cdescr_size = descr_size; } +static inline int divUp(int total, int grain) +{ + return (total + grain - 1) / grain; +} + +static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName, + size_t globalThreads[3], size_t localThreads[3], + vector< pair > &args) +{ + size_t wave_size = 0; + queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); + if (wave_size <= 16) + { + char build_options[64]; + sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1"); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + } + else + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1); +} + void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists) @@ -1582,8 +1601,10 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - size_t globalThreads[3] = { img_block_width * 32, img_block_height * 2, 1 }; - size_t localThreads[3] = { 32, 2, 1 }; + int blocks_total = img_block_width * img_block_height; + int blocks_in_group = 4; + size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; + size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 }; int grad_quadstep = grad.step >> 2; int qangle_step = qangle.step; @@ -1593,14 +1614,15 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float); int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float); - int smem = hists_size + final_hists_size; + int smem = (hists_size + final_hists_size) * blocks_in_group; - args.push_back( make_pair( sizeof(cl_int), (void *)&width)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_stride_x)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_stride_y)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width)); + args.push_back( make_pair( sizeof(cl_int), (void *)&blocks_in_group)); + args.push_back( make_pair( sizeof(cl_int), (void *)&blocks_total)); args.push_back( make_pair( sizeof(cl_int), (void *)&grad_quadstep)); args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step)); args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data)); @@ -1609,7 +1631,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y, @@ -1637,7 +1659,7 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y, @@ -1671,7 +1693,7 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, @@ -1702,7 +1724,7 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, @@ -1734,12 +1756,7 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); -} - -static inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img, @@ -1768,7 +1785,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img, @@ -1798,7 +1815,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz) @@ -1815,14 +1832,16 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz float ifx = (float)src.cols / sz.width; float ify = (float)src.rows / sz.height; + int src_step = static_cast(src.step); + int dst_step = static_cast(dst.step); vector< pair > args; args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src_step)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width)); @@ -1830,5 +1849,5 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index ba69573ad6..c9ce89f9f5 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -397,6 +397,15 @@ namespace cv } break; + case IS_CPU_DEVICE: + { + cl_device_type devicetype; + openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], + CL_DEVICE_TYPE, sizeof(cl_device_type), + &devicetype, NULL)); + *(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU); + } + break; default: CV_Error(-1, "Invalid device info type"); break; diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index ce96e3a9e3..87d1d375ef 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -394,7 +394,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be if( rtype < 0 ) rtype = type(); else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels()); //int scn = channels(); int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype); diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index db11ed1410..8852facae8 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -53,76 +53,96 @@ //---------------------------------------------------------------------------- // Histogram computation - -__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, - const int cnbins, const int cblock_hist_size, const int img_block_width, - const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, - const float scale, __global float* block_hists, __local float* smem) +// 12 threads for a cell, 12x4 threads per block +__kernel void compute_hists_kernel( + const int cblock_stride_x, const int cblock_stride_y, + const int cnbins, const int cblock_hist_size, const int img_block_width, + const int blocks_in_group, const int blocks_total, + const int grad_quadstep, const int qangle_step, + __global const float* grad, __global const uchar* qangle, + const float scale, __global float* block_hists, __local float* smem) { - const int lidX = get_local_id(0); + const int lx = get_local_id(0); + const int lp = lx / 24; /* local group id */ + const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ + const int gidY = gid / img_block_width; + const int gidX = gid - gidY * img_block_width; + + const int lidX = lx - lp * 24; const int lidY = get_local_id(1); - const int gidX = get_group_id(0); - const int gidY = get_group_id(1); - const int cell_x = lidX / 16; + const int cell_x = lidX / 12; const int cell_y = lidY; - const int cell_thread_x = lidX & 0xF; + const int cell_thread_x = lidX - cell_x * 12; - __local float* hists = smem; - __local float* final_hist = smem + cnbins * 48; + __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * + CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); + __local float* final_hist = hists + cnbins * + (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; const int offset_y = gidY * cblock_stride_y + (cell_y << 2); - __global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1); - __global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1); - - // 12 means that 12 pixels affect on block's cell (in one row) - if (cell_thread_x < 12) - { - __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x; - for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48] = 0.f; + __global const float* grad_ptr = (gid < blocks_total) ? + grad + offset_y * grad_quadstep + (offset_x << 1) : grad; + __global const uchar* qangle_ptr = (gid < blocks_total) ? + qangle + offset_y * qangle_step + (offset_x << 1) : qangle; - const int dist_x = -4 + cell_thread_x - 4 * cell_x; + __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + + cell_thread_x; + for (int bin_id = 0; bin_id < cnbins; ++bin_id) + hist[bin_id * 48] = 0.f; - const int dist_y_begin = -4 - 4 * lidY; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) - { - float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + const int dist_x = -4 + cell_thread_x - 4 * cell_x; + const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); - grad_ptr += grad_quadstep; - qangle_ptr += qangle_step; + const int dist_y_begin = -4 - 4 * lidY; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) + { + float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); + uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + grad_ptr += grad_quadstep; + qangle_ptr += qangle_step; - float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale); - float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f; + int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - hist[bin.x * 48] += gaussian * interp_weight * vote.x; - hist[bin.y * 48] += gaussian * interp_weight * vote.y; - } + float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * + dist_center_x) * scale); + float interp_weight = (8.f - fabs(dist_y + 0.5f)) * + (8.f - fabs(dist_x + 0.5f)) / 64.f; - volatile __local float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) - { - if (cell_thread_x < 6) hist_[0] += hist_[6]; - if (cell_thread_x < 3) hist_[0] += hist_[3]; - if (cell_thread_x == 0) - final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; - } + hist[bin.x * 48] += gaussian * interp_weight * vote.x; + hist[bin.y * 48] += gaussian * interp_weight * vote.y; } - barrier(CLK_LOCAL_MEM_FENCE); - __global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size; + volatile __local float* hist_ = hist; + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) + { + if (cell_thread_x < 6) + hist_[0] += hist_[6]; + barrier(CLK_LOCAL_MEM_FENCE); + if (cell_thread_x < 3) + hist_[0] += hist_[3]; +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (cell_thread_x == 0) + final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = + hist_[0] + hist_[1] + hist_[2]; + } +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; - if (tid < cblock_hist_size) + int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; + if ((tid < cblock_hist_size) && (gid < blocks_total)) + { + __global float* block_hist = block_hists + + (gidY * img_block_width + gidX) * cblock_hist_size; block_hist[tid] = final_hist[tid]; + } } //------------------------------------------------------------- @@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size) unsigned int tid = get_local_id(0); float sum = smem[tid]; - if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 512) + { + if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 256) + { + if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 128) + { + if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + } if (tid < 32) { if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { +#endif if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; } @@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr if (tid < 64) products[tid] = product = product + products[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); + volatile __local float* smem = products; if (tid < 32) { - volatile __local float* smem = products; smem[tid] = product = product + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - volatile __local float* smem = products; +#endif smem[tid] = product = product + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif smem[tid] = product = product + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif smem[tid] = product = product + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif smem[tid] = product = product + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif smem[tid] = product = product + smem[tid + 1]; } @@ -248,8 +331,8 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr // Extract descriptors __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, - const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - __global const float* block_hists, __global float* descriptors) + const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -271,8 +354,8 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in } __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, - const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) + const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, + const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -301,8 +384,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in // Gradients computation __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); @@ -400,8 +483,8 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c } __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + __global const uchar * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index c772be78ac..1043b8410b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2 } #define BUFFER 64 + +#ifdef CPU +void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + smem3[tid] = val3 += smem3[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += smem2[tid + 16]; + smem3[tid] = val3 += smem3[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = val1 += smem1[tid + 8]; + smem2[tid] = val2 += smem2[tid + 8]; + smem3[tid] = val3 += smem3[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = val1 += smem1[tid + 4]; + smem2[tid] = val2 += smem2[tid + 4]; + smem3[tid] = val3 += smem3[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = val1 += smem1[tid + 2]; + smem2[tid] = val2 += smem2[tid + 2]; + smem3[tid] = val3 += smem3[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = val1 += smem1[tid + 1]; + smem2[BUFFER] = val2 += smem2[tid + 1]; + smem3[BUFFER] = val3 += smem3[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + smem2[tid] = (val2 += smem2[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + smem2[tid] = (val2 += smem2[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + smem2[tid] = (val2 += smem2[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + smem2[tid] = (val2 += smem2[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + smem2[tid] = (val2 += smem2[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + smem2[tid] = (val2 += smem2[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + smem2[tid] = (val2 += smem2[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + smem2[BUFFER] = (val2 += smem2[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce1(float val1, volatile __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#else void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; @@ -325,6 +528,7 @@ void reduce1(float val1, __local float* smem1, int tid) vmem1[tid] = val1 += vmem1[tid + 1]; } } +#endif #define SCALE (1.0f / (1 << 20)) #define THRESHOLD 0.01f @@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); } - +#define GRIDSIZE 3 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else __local float smem1[BUFFER]; __local float smem2[BUFFER]; __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 prevPt = prevPts[gid] / (1 << level); + float2 prevPt = prevPts[gid] / (float2)(1 << level); if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) { @@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float A12 = 0; float A22 = 0; - float I_patch[3][3]; - float dIdx_patch[3][3]; - float dIdy_patch[3][3]; + float I_patch[GRIDSIZE][GRIDSIZE]; + float dIdx_patch[GRIDSIZE][GRIDSIZE]; + float dIdy_patch[GRIDSIZE][GRIDSIZE]; yBase=yid; { @@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], &A11, &A12, &A22); } + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -609,8 +826,13 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, nextPts[gid] = prevPt; if (calcErr) - err[gid] = smem1[0] / (c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y); +#endif } - } + __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { - __local float smem1[BUFFER]; - __local float smem2[BUFFER]; - __local float smem3[BUFFER]; +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 nextPt = prevPts[gid]/(1<= cols || nextPt.y < 0 || nextPt.y >= rows) { @@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, // extract the patch from the first image, compute covariation matrix of derivatives - float A11 = 0; - float A12 = 0; - float A22 = 0; + float A11 = 0.0f; + float A12 = 0.0f; + float A22 = 0.0f; float4 I_patch[8]; float4 dIdx_patch[8]; @@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, &b1, &b2); } - reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, nextPts[gid] = nextPt; if (calcErr) - err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y); +#endif } } diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 99177c7bd0..196a786d5b 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -226,9 +226,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; - // int Y = get_group_id(1) * ROWSperTHREAD + radius; + // int Y = get_group_id(1) * ROWSperTHREAD + radius; - #define Y (get_group_id(1) * ROWSperTHREAD + radius) +#define Y (get_group_id(1) * ROWSperTHREAD + radius) volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; @@ -251,9 +251,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < cwidth - radius && Y < cheight - radius) { - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[0]) { disparImage[0] = (unsigned char)(d + minSSD.y); @@ -264,7 +264,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char for(int row = 1; row < end_row; row++) { int idx1 = y_tex * img_step + x_tex; - int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex; + int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE); @@ -278,10 +278,10 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < cwidth - radius && row < cheight - radius - Y) { int idx = row * cminSSD_step; - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[idx]) { disparImage[disp_step * row] = (unsigned char)(d + minSSD.y); @@ -378,50 +378,50 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in int beg_row = group_id_y * RpT; int end_row = min(beg_row + RpT, disp_rows); - // if (x < disp_cols) - // { - int y = beg_row; +// if (x < disp_cols) +// { + int y = beg_row; - float sum = 0; - float sum_extra = 0; + float sum = 0; + float sum_extra = 0; - for(int i = y - winsz2; i <= y + winsz2; ++i) - { - sum += sobel(input, x - winsz2, i, input_rows, input_cols); - if (cols_extra) - sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); - } + for(int i = y - winsz2; i <= y + winsz2; ++i) + { + sum += sobel(input, x - winsz2, i, input_rows, input_cols); + if (cols_extra) + sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); + } + *cols = sum; + if (cols_extra) + *cols_extra = sum_extra; + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; + if (sum_win < threshold) + disp[y * disp_step + x] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + for(int y = beg_row + 1; y < end_row; ++y) + { + sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + + sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); *cols = sum; + if (cols_extra) + { + sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) + + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); *cols_extra = sum_extra; + } barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; if (sum_win < threshold) disp[y * disp_step + x] = 0; barrier(CLK_LOCAL_MEM_FENCE); - - for(int y = beg_row + 1; y < end_row; ++y) - { - sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + - sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); - *cols = sum; - - if (cols_extra) - { - sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) - + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); - *cols_extra = sum_extra; - } - - barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; - if (sum_win < threshold) - disp[y * disp_step + x] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - } - // } + } + // } } diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index c8d4b52deb..4a6ce1c790 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -16,7 +16,7 @@ // // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com -// Yao Wang, yao@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com // Nathan, liujun@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, @@ -47,6 +47,7 @@ #include "precomp.hpp" + using namespace std; using namespace cv; using namespace cv::ocl; @@ -58,11 +59,7 @@ namespace ocl ///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *pyrlk; extern const char *pyrlk_no_image; -extern const char *operator_setTo; -extern const char *operator_convertTo; -extern const char *operator_copyToM; extern const char *arithm_mul; -extern const char *pyr_down; } } @@ -105,364 +102,7 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe } } -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - -/////////////////////////////////////////////////////////////////////////// -//////////////////////////////// ConvertTo //////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -static void convert_run_cus(const oclMat &src, oclMat &dst, double alpha, double beta) -{ - string kernelName = "convert_to_S"; - stringstream idxStr; - idxStr << src.depth(); - kernelName += idxStr.str(); - float alpha_f = (float)alpha, beta_f = (float)beta; - CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols); - vector > args; - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); - int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; - } - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f )); - openCLExecuteKernel2(dst.clCxt , &operator_convertTo, kernelName, globalThreads, - localThreads, args, dst.oclchannels(), dst.depth(), CLFLUSH); -} -void convertTo( const oclMat &src, oclMat &m, int rtype, double alpha = 1, double beta = 0 ); -void convertTo( const oclMat &src, oclMat &dst, int rtype, double alpha, double beta ) -{ - //cout << "cv::ocl::oclMat::convertTo()" << endl; - - bool noScale = fabs(alpha - 1) < std::numeric_limits::epsilon() - && fabs(beta) < std::numeric_limits::epsilon(); - - if( rtype < 0 ) - rtype = src.type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.oclchannels()); - - int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype); - if( sdepth == ddepth && noScale ) - { - src.copyTo(dst); - return; - } - - oclMat temp; - const oclMat *psrc = &src; - if( sdepth != ddepth && psrc == &dst ) - psrc = &(temp = src); - - dst.create( src.size(), rtype ); - convert_run_cus(*psrc, dst, alpha, beta); -} - -/////////////////////////////////////////////////////////////////////////// -//////////////////////////////// setTo //////////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -//oclMat &operator = (const Scalar &s) -//{ -// //cout << "cv::ocl::oclMat::=" << endl; -// setTo(s); -// return *this; -//} -static void set_to_withoutmask_run_cus(const oclMat &dst, const Scalar &scalar, string kernelName) -{ - vector > args; - - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - } - char compile_option[32]; - union sc - { - cl_uchar4 uval; - cl_char4 cval; - cl_ushort4 usval; - cl_short4 shval; - cl_int4 ival; - cl_float4 fval; - cl_double4 dval; - } val; - switch(dst.depth()) - { - case 0: - val.uval.s[0] = saturate_cast(scalar.val[0]); - val.uval.s[1] = saturate_cast(scalar.val[1]); - val.uval.s[2] = saturate_cast(scalar.val[2]); - val.uval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=uchar"); - args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=uchar4"); - args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 1: - val.cval.s[0] = saturate_cast(scalar.val[0]); - val.cval.s[1] = saturate_cast(scalar.val[1]); - val.cval.s[2] = saturate_cast(scalar.val[2]); - val.cval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=char"); - args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=char4"); - args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 2: - val.usval.s[0] = saturate_cast(scalar.val[0]); - val.usval.s[1] = saturate_cast(scalar.val[1]); - val.usval.s[2] = saturate_cast(scalar.val[2]); - val.usval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=ushort"); - args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=ushort4"); - args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 3: - val.shval.s[0] = saturate_cast(scalar.val[0]); - val.shval.s[1] = saturate_cast(scalar.val[1]); - val.shval.s[2] = saturate_cast(scalar.val[2]); - val.shval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=short"); - args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=short4"); - args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 4: - val.ival.s[0] = saturate_cast(scalar.val[0]); - val.ival.s[1] = saturate_cast(scalar.val[1]); - val.ival.s[2] = saturate_cast(scalar.val[2]); - val.ival.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=int"); - args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); - break; - case 2: - sprintf(compile_option, "-D GENTYPE=int2"); - cl_int2 i2val; - i2val.s[0] = val.ival.s[0]; - i2val.s[1] = val.ival.s[1]; - args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=int4"); - args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 5: - val.fval.s[0] = (float)scalar.val[0]; - val.fval.s[1] = (float)scalar.val[1]; - val.fval.s[2] = (float)scalar.val[2]; - val.fval.s[3] = (float)scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=float"); - args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=float4"); - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 6: - val.dval.s[0] = scalar.val[0]; - val.dval.s[1] = scalar.val[1]; - val.dval.s[2] = scalar.val[2]; - val.dval.s[3] = scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=double"); - args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=double4"); - args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unknown depth"); - } -#ifdef CL_VERSION_1_2 - if(dst.offset == 0 && dst.cols == dst.wholecols) - { - clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); - } - else - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); - } -#else - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); -#endif -} - -static oclMat &setTo(oclMat &src, const Scalar &scalar) -{ - CV_Assert( src.depth() >= 0 && src.depth() <= 6 ); - CV_DbgAssert( !src.empty()); - - if(src.type() == CV_8UC1) - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask_C1_D0"); - } - else - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask"); - } - - return src; -} - -/////////////////////////////////////////////////////////////////////////// -////////////////////////////////// CopyTo ///////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -// static void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName) -// { -// CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols && -// src.rows == dst.rows && src.cols == dst.cols -// && mask.type() == CV_8UC1); - -// vector > args; - -// std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"}, -// {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"}, -// {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, -// {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} -// }; -// char compile_option[32]; -// sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); -// size_t localThreads[3] = {16, 16, 1}; -// size_t globalThreads[3]; - -// globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0]; -// globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1]; -// globalThreads[2] = 1; - -// int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); -// int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); - -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); - -// openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads, -// localThreads, args, -1, -1, compile_option, CLFLUSH); -// } - -static void copyTo(const oclMat &src, oclMat &m ) -{ - CV_DbgAssert(!src.empty()); - m.create(src.size(), src.type()); - openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset, - src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset); -} - -// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask) -// { -// if (mask.empty()) -// { -// copyTo(src, mat); -// } -// else -// { -// mat.create(src.size(), src.type()); -// copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask"); -// } -// } - -static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) +static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { @@ -470,9 +110,6 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c return; } - //dst.create(src1.size(), src1.type()); - //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - // src1.rows == src2.rows && src2.rows == dst.rows); CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); @@ -480,24 +117,8 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c CV_Assert(src1.depth() != CV_8S); Context *clCxt = src1.clCxt; - //int channels = dst.channels(); - //int depth = dst.depth(); - - //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1} - //}; - - //size_t vector_length = vector_lengths[channels-1][depth]; - //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - //int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 16, 16, 1 }; - //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - // divUp(dst.rows, localThreads[1]) * localThreads[1], - // 1 - // }; size_t globalThreads[3] = { src1.cols, src1.rows, 1 @@ -508,67 +129,20 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - //args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(float), (float *)&scalar )); - //if(_scalar != NULL) - //{ - float scalar1 = *((float *)_scalar); - args.push_back( make_pair( sizeof(float), (float *)&scalar1 )); - //} - - openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH); -} - -static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) -{ - arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar)); -} - -static void pyrdown_run_cus(const oclMat &src, const oclMat &dst) -{ - - CV_Assert(src.type() == dst.type()); - CV_Assert(src.depth() != CV_8S); - - Context *clCxt = src.clCxt; - - string kernelName = "pyrDown"; - - size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { src.cols, dst.rows, 1}; - - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); - - openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH); -} - -static void pyrDown_cus(const oclMat &src, oclMat &dst) -{ - CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); - - dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); - - pyrdown_run_cus(src, dst); + openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth()); } static void lkSparse_run(oclMat &I, oclMat &J, - const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, - int level, /*dim3 block, */dim3 patch, Size winSize, int iters) + const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, + int level, dim3 patch, Size winSize, int iters) { Context *clCxt = I.clCxt; int elemCntPerRow = I.step / I.elemSize(); @@ -603,7 +177,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&level )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); - if (!isImageSupported) + if (!isImageSupported) args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y )); @@ -613,15 +187,26 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - if(isImageSupported) + bool is_cpu; + queryDeviceInfo(IS_CPU_DEVICE, &is_cpu); + if (is_cpu) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU"); releaseTexture(ITex); releaseTexture(JTex); } else { - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + if(isImageSupported) + { + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + releaseTexture(ITex); + releaseTexture(JTex); + } + else + { + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + } } } @@ -631,7 +216,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { nextPts.release(); status.release(); - //if (err) err->release(); + if (err) err->release(); return; } @@ -657,13 +242,11 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); oclMat temp2 = nextPts.reshape(1); - //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f)); multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f); //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2); ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); - //status.setTo(Scalar::all(1)); - setTo(status, Scalar::all(1)); + status.setTo(Scalar::all(1)); bool errMat = false; if (!err) @@ -673,7 +256,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next } else ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); - //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err); // build the image pyramids. @@ -682,25 +264,14 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next if (cn == 1 || cn == 4) { - //prevImg.convertTo(prevPyr_[0], CV_32F); - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(prevImg, prevPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); - } - else - { - //oclMat buf_; - // cvtColor(prevImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(prevPyr_[0], CV_32F); - - // cvtColor(nextImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(nextPyr_[0], CV_32F); + prevImg.convertTo(prevPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); } for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } // dI/dx ~ Ix, dI/dy ~ Iy @@ -709,17 +280,15 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { lkSparse_run(prevPyr_[level], nextPyr_[level], prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols, - level, /*block, */patch, winSize, iters); + level, patch, winSize, iters); } - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); - if(errMat) delete err; } static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, - oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) + oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) { Context *clCxt = I.clCxt; bool isImageSupported = support_image2d(); @@ -754,11 +323,6 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, JTex = (cl_mem)J.data; } - //int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2}; - //const int patchWidth = 16 + 2 * halfWin.x; - //const int patchHeight = 16 + 2 * halfWin.y; - //size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); - vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); @@ -787,15 +351,14 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, if (isImageSupported) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); releaseTexture(ITex); releaseTexture(JTex); } else { - //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n"); - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); } } @@ -813,23 +376,20 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI nextPyr_.resize(maxLevel + 1); prevPyr_[0] = prevImg; - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); - //uPyr_[1].setTo(Scalar::all(0)); - //vPyr_[1].setTo(Scalar::all(0)); - setTo(uPyr_[1], Scalar::all(0)); - setTo(vPyr_[1], Scalar::all(0)); + uPyr_[1].setTo(Scalar::all(0)); + vPyr_[1].setTo(Scalar::all(0)); Size winSize2i(winSize.width, winSize.height); @@ -846,10 +406,6 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI idx = idx2; } - //uPyr_[idx].copyTo(u); - //vPyr_[idx].copyTo(v); - copyTo(uPyr_[idx], u); - copyTo(vPyr_[idx], v); - - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); + uPyr_[idx].copyTo(u); + vPyr_[idx].copyTo(v); } diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index 856828d6a5..dd46ff6e06 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -115,10 +115,9 @@ int main(int argc, char **argv) std::cout << "platform invalid\n"; return -1; } - if(pid != 0 || device != 0) - { - setDevice(oclinfo[pid], device); - } + + setDevice(oclinfo[pid], device); + cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; return RUN_ALL_TESTS(); }