diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 68f3949a84..1f8afe5590 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -76,6 +76,11 @@ namespace cv int cdescr_width; int cdescr_height; + // A shift value and type that allows qangle to be different + // sizes on different hardware + int qangle_step_shift; + int qangle_type; + void set_up_constants(int nbins, int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y); @@ -153,6 +158,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo hog_device_cpu = true; else hog_device_cpu = false; + } size_t cv::ocl::HOGDescriptor::getDescriptorSize() const @@ -213,7 +219,7 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride) effect_size = img.size(); grad.create(img.size(), CV_32FC2); - qangle.create(img.size(), CV_8UC2); + qangle.create(img.size(), hog::qangle_type); const size_t block_hist_size = getBlockHistogramSize(); const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride); @@ -1607,6 +1613,16 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int descr_size = descr_width * nblocks_win_y; cdescr_size = descr_size; + + qangle_type = CV_8UC2; + qangle_step_shift = 0; + // Some Intel devices have low single-byte access performance, + // so we change the datatype here. + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + qangle_type = CV_32SC2; + qangle_step_shift = 2; + } } void cv::ocl::device::hog::compute_hists(int nbins, @@ -1628,7 +1644,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int blocks_total = img_block_width * img_block_height; int grad_quadstep = grad.step >> 2; - int qangle_step = qangle.step; + int qangle_step = qangle.step >> qangle_step_shift; int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; @@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( make_pair( sizeof(cl_int), (void *)&height)); args.push_back( make_pair( sizeof(cl_int), (void *)&width)); @@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step >> 2; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( make_pair( sizeof(cl_int), (void *)&height)); args.push_back( make_pair( sizeof(cl_int), (void *)&width)); diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 0d2f26f966..60d7346e5a 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -50,6 +50,14 @@ #define NTHREADS 256 #define CV_PI_F 3.1415926535897932384626433832795f +#ifdef INTEL_DEVICE +#define QANGLE_TYPE int +#define QANGLE_TYPE2 int2 +#else +#define QANGLE_TYPE uchar +#define QANGLE_TYPE2 uchar2 +#endif + //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block @@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel( const int cnbins, const int cblock_hist_size, const int img_block_width, const int blocks_in_group, const int blocks_total, const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, + __global const float* grad, __global const QANGLE_TYPE* qangle, __global const float* gauss_w_lut, __global float* block_hists, __local float* smem) { @@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel( __global const float* grad_ptr = (gid < blocks_total) ? grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? + __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? qangle + offset_y * qangle_step + (offset_x << 1) : qangle; __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + @@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel( for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); grad_ptr += grad_quadstep; qangle_ptr += qangle_step; @@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel( __kernel void compute_gradients_8UC4_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); @@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel( __kernel void compute_gradients_8UC1_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, + __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0);