Merge pull request #2234 from KonstantinMatskevich:ocl_tapi_hog

pull/2192/head
Andrey Pavlenko 11 years ago committed by OpenCV Buildbot
commit 22146e4b18
  1. 24
      modules/objdetect/include/opencv2/objdetect.hpp
  2. 94
      modules/objdetect/perf/opencl/perf_hogdetect.cpp
  3. 579
      modules/objdetect/src/hog.cpp
  4. 726
      modules/objdetect/src/opencl/objdetect_hog.cl
  5. 121
      modules/objdetect/test/opencl/test_hogdetector.cpp

@ -285,10 +285,11 @@ public:
CV_WRAP virtual void save(const String& filename, const String& objname = String()) const;
virtual void copyTo(HOGDescriptor& c) const;
CV_WRAP virtual void compute(const Mat& img,
CV_WRAP virtual void compute(InputArray img,
CV_OUT std::vector<float>& descriptors,
Size winStride = Size(), Size padding = Size(),
const std::vector<Point>& locations = std::vector<Point>()) const;
//with found weights output
CV_WRAP virtual void detect(const Mat& img, CV_OUT std::vector<Point>& foundLocations,
CV_OUT std::vector<double>& weights,
@ -300,13 +301,14 @@ public:
double hitThreshold = 0, Size winStride = Size(),
Size padding = Size(),
const std::vector<Point>& searchLocations=std::vector<Point>()) const;
//with result weights output
CV_WRAP virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
CV_OUT std::vector<double>& foundWeights, double hitThreshold = 0,
Size winStride = Size(), Size padding = Size(), double scale = 1.05,
double finalThreshold = 2.0,bool useMeanshiftGrouping = false) const;
//without found weights output
virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
double hitThreshold = 0, Size winStride = Size(),
Size padding = Size(), double scale = 1.05,
double finalThreshold = 2.0, bool useMeanshiftGrouping = false) const;
@ -328,25 +330,27 @@ public:
CV_PROP double L2HysThreshold;
CV_PROP bool gammaCorrection;
CV_PROP std::vector<float> svmDetector;
UMat oclSvmDetector;
float free_coef;
CV_PROP int nlevels;
// evaluate specified ROI and return confidence value for each location
virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
// evaluate specified ROI and return confidence value for each location
virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
double hitThreshold = 0, cv::Size winStride = Size(),
cv::Size padding = Size()) const;
// evaluate specified ROI and return confidence value for each location in multiple scales
virtual void detectMultiScaleROI(const cv::Mat& img,
// evaluate specified ROI and return confidence value for each location in multiple scales
virtual void detectMultiScaleROI(const cv::Mat& img,
CV_OUT std::vector<cv::Rect>& foundLocations,
std::vector<DetectionROI>& locations,
double hitThreshold = 0,
int groupThreshold = 0) const;
// read/parse Dalal's alt model file
void readALTModel(String modelfile);
void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
// read/parse Dalal's alt model file
void readALTModel(String modelfile);
void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
};

@ -0,0 +1,94 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang Bai, fangfang@multicorewareinc.com
// Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
#include "opencv2/ts/ocl_perf.hpp"
#ifdef HAVE_OPENCL
namespace cvtest {
namespace ocl {
///////////// HOG////////////////////////
struct RectLess :
public std::binary_function<cv::Rect, cv::Rect, bool>
{
bool operator()(const cv::Rect& a,
const cv::Rect& b) const
{
if (a.x != b.x)
return a.x < b.x;
else if (a.y != b.y)
return a.y < b.y;
else if (a.width != b.width)
return a.width < b.width;
else
return a.height < b.height;
}
};
OCL_PERF_TEST(HOGFixture, HOG)
{
UMat src;
imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE).copyTo(src);
ASSERT_FALSE(src.empty());
vector<cv::Rect> found_locations;
declare.in(src);
HOGDescriptor hog;
hog.setSVMDetector(hog.getDefaultPeopleDetector());
OCL_TEST_CYCLE() hog.detectMultiScale(src, found_locations);
std::sort(found_locations.begin(), found_locations.end(), RectLess());
SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
}
}
}
#endif

@ -42,6 +42,7 @@
#include "precomp.hpp"
#include "opencv2/core/core_c.h"
#include "opencl_kernels.hpp"
#include <cstdio>
#include <iterator>
@ -58,6 +59,29 @@
namespace cv
{
#define NTHREADS 256
enum {DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW};
static int numPartsWithin(int size, int part_size, int stride)
{
return (size - part_size + stride) / stride;
}
static Size numPartsWithin(cv::Size size, cv::Size part_size,
cv::Size stride)
{
return Size(numPartsWithin(size.width, part_size.width, stride.width),
numPartsWithin(size.height, part_size.height, stride.height));
}
static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins)
{
Size cells_per_block = Size(block_size.width / cell_size.width,
block_size.height / cell_size.height);
return (size_t)(nbins * cells_per_block.area());
}
size_t HOGDescriptor::getDescriptorSize() const
{
CV_Assert(blockSize.width % cellSize.width == 0 &&
@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const
void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
{
_svmDetector.getMat().convertTo(svmDetector, CV_32F);
CV_Assert( checkDetectorSize() );
CV_Assert(checkDetectorSize());
Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);
size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride);
for (int i = 0; i < blocks_per_img.height; ++i)
for (int j = 0; j < blocks_per_img.width; ++j)
{
const float *src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size;
float *dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size;
for (size_t k = 0; k < block_hist_size; ++k)
dst[k] = src[k];
}
size_t descriptor_size = getDescriptorSize();
free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0;
detector_reordered.copyTo(oclSvmDetector);
}
#define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog"
@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b)
return a;
}
void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
#ifdef HAVE_OPENCL
static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale,
UMat grad, UMat qangle, bool correct_gamma, int nbins)
{
ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc);
if(k.empty())
return false;
UMat img = _img.getUMat();
size_t localThreads[3] = { NTHREADS, 1, 1 };
size_t globalThreads[3] = { width, height, 1 };
char correctGamma = (correct_gamma) ? 1 : 0;
int grad_quadstep = (int)grad.step >> 3;
int qangle_step_shift = 0;
int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
int idx = 0;
idx = k.set(idx, height);
idx = k.set(idx, width);
idx = k.set(idx, (int)img.step1());
idx = k.set(idx, grad_quadstep);
idx = k.set(idx, qangle_step);
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img));
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad));
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle));
idx = k.set(idx, angle_scale);
idx = k.set(idx, correctGamma);
idx = k.set(idx, nbins);
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction)
{
float angleScale = (float)(nbins / CV_PI);
return ocl_compute_gradients_8UC1(effect_size.height, effect_size.width, img,
angleScale, grad, qangle, gamma_correction, nbins);
}
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width,
UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
{
ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
if(k.empty())
return false;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)/block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
int blocks_total = img_block_width * img_block_height;
int qangle_step_shift = 0;
int grad_quadstep = (int)grad.step >> 2;
int qangle_step = (int)qangle.step >> qangle_step_shift;
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
size_t globalThreads[3] = {((img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group) * localThreads[0], 2, 1 };
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
int smem = (hists_size + final_hists_size) * blocks_in_group;
int idx = 0;
idx = k.set(idx, block_stride_x);
idx = k.set(idx, block_stride_y);
idx = k.set(idx, nbins);
idx = k.set(idx, (int)block_hist_size);
idx = k.set(idx, img_block_width);
idx = k.set(idx, blocks_in_group);
idx = k.set(idx, blocks_total);
idx = k.set(idx, grad_quadstep);
idx = k.set(idx, qangle_step);
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad));
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle));
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut));
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists));
idx = k.set(idx, (void*)NULL, (size_t)smem);
return k.run(2, globalThreads, localThreads, false);
}
static int power_2up(unsigned int n)
{
for(unsigned int i = 1; i<=1024; i<<=1)
if(n < i)
return i;
return -1; // Input is too big
}
static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, UMat block_hists, float threshold)
{
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
/ block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
/ block_stride_y;
int nthreads;
size_t globalThreads[3] = { 1, 1, 1 };
size_t localThreads[3] = { 1, 1, 1 };
int idx = 0;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
ocl::Kernel k;
if ( nbins == 9 )
{
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
int blocks_in_group = NTHREADS / block_hist_size;
nthreads = blocks_in_group * block_hist_size;
int num_groups = (img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group;
globalThreads[0] = nthreads * num_groups;
localThreads[0] = nthreads;
}
else
{
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
nthreads = power_2up(block_hist_size);
globalThreads[0] = img_block_width * nthreads;
globalThreads[1] = img_block_height;
localThreads[0] = nthreads;
if ((nthreads < 32) || (nthreads > 512) )
return false;
idx = k.set(idx, nthreads);
idx = k.set(idx, block_hist_size);
idx = k.set(idx, img_block_width);
}
idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists));
idx = k.set(idx, threshold);
idx = k.set(idx, (void*)NULL, nthreads * sizeof(float));
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, UMat block_hists, UMat descriptors,
int block_hist_size, int descr_size, int descr_width)
{
ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc);
if(k.empty())
return false;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int descriptors_quadstep = (int)descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
size_t localThreads[3] = { NTHREADS, 1, 1 };
int idx = 0;
idx = k.set(idx, block_hist_size);
idx = k.set(idx, descriptors_quadstep);
idx = k.set(idx, descr_size);
idx = k.set(idx, descr_width);
idx = k.set(idx, img_block_width);
idx = k.set(idx, win_block_stride_x);
idx = k.set(idx, win_block_stride_y);
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, UMat block_hists, UMat descriptors,
int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y)
{
ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc);
if(k.empty())
return false;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int descriptors_quadstep = (int)descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
size_t localThreads[3] = { NTHREADS, 1, 1 };
int idx = 0;
idx = k.set(idx, block_hist_size);
idx = k.set(idx, descriptors_quadstep);
idx = k.set(idx, descr_size);
idx = k.set(idx, nblocks_win_x);
idx = k.set(idx, nblocks_win_y);
idx = k.set(idx, img_block_width);
idx = k.set(idx, win_block_stride_x);
idx = k.set(idx, win_block_stride_y);
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
{
Size imgSize = _img.size();
Size effect_size = imgSize;
UMat grad(imgSize, CV_32FC2);
UMat qangle(imgSize, CV_8UC2);
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
UMat labels(1, wins_per_img.area(), CV_8U);
float scale = 1.f / (2.f * sigma * sigma);
Mat gaussian_lut(1, 512, CV_32FC1);
int idx = 0;
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
if(!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection))
return false;
UMat gauss_w_lut;
gaussian_lut.copyTo(gauss_w_lut);
if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
return false;
if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
effect_size.width, block_hists, (float)L2HysThreshold))
return false;
Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
wins_per_img = numPartsWithin(effect_size, winSize, win_stride);
int descr_size = blocks_per_win.area()*(int)block_hist_size;
int descr_width = (int)block_hist_size*blocks_per_win.width;
UMat descriptors(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
switch (descr_format)
{
case DESCR_FORMAT_ROW_BY_ROW:
if(!ocl_extract_descrs_by_rows(winSize.height, winSize.width,
blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height,
effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width))
return false;
break;
case DESCR_FORMAT_COL_BY_COL:
if(!ocl_extract_descrs_by_cols(winSize.height, winSize.width,
blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width,
block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height))
return false;
break;
default:
return false;
}
descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors);
return true;
}
#endif //HAVE_OPENCL
void HOGDescriptor::compute(InputArray _img, std::vector<float>& descriptors,
Size winStride, Size padding, const std::vector<Point>& locations) const
{
if( winStride == Size() )
@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height));
Size imgSize = _img.size();
size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2);
CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(),
ocl_compute(_img, winStride, descriptors, DESCR_FORMAT_COL_BY_COL, blockSize,
cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold))
Mat img = _img.getMat();
HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
if( !nwindows )
@ -1263,20 +1622,215 @@ private:
Mutex* mtx;
};
#ifdef HAVE_OPENCL
static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width,
const UMat& block_hists, UMat detector,
float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
{
int nthreads;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
ocl::Kernel k;
int idx = 0;
switch (descr_size.width)
{
case 180:
nthreads = 180;
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
idx = k.set(idx, descr_size.width);
idx = k.set(idx, descr_size.height);
break;
case 252:
nthreads = 256;
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
idx = k.set(idx, descr_size.width);
idx = k.set(idx, descr_size.height);
break;
default:
nthreads = 256;
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
idx = k.set(idx, descr_size.area());
idx = k.set(idx, descr_size.height);
}
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
size_t localThreads[3] = { nthreads, 1, 1 };
idx = k.set(idx, block_hist_size);
idx = k.set(idx, img_win_width);
idx = k.set(idx, img_block_width);
idx = k.set(idx, win_block_stride_x);
idx = k.set(idx, win_block_stride_y);
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector));
idx = k.set(idx, free_coef);
idx = k.set(idx, threshold);
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels));
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_threshold, Size win_stride,
const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride, Size winSize,
bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef)
{
hits.clear();
if (oclSvmDetector.empty())
return false;
Size imgSize = img.size();
Size effect_size = imgSize;
UMat grad(imgSize, CV_32FC2);
UMat qangle(imgSize, CV_8UC2);
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
UMat labels(1, wins_per_img.area(), CV_8U);
float scale = 1.f / (2.f * sigma * sigma);
Mat gaussian_lut(1, 512, CV_32FC1);
int idx = 0;
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
if(!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection))
return false;
UMat gauss_w_lut;
gaussian_lut.copyTo(gauss_w_lut);
if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
return false;
if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
effect_size.width, block_hists, (float)L2HysThreshold))
return false;
Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
Size descr_size((int)block_hist_size*blocks_per_win.width, blocks_per_win.height);
if(!ocl_classify_hists(winSize.height, winSize.width, blockStride.height,
blockStride.width, win_stride.height, win_stride.width,
effect_size.height, effect_size.width, block_hists, oclSvmDetector,
free_coef, (float)hit_threshold, labels, descr_size, (int)block_hist_size))
return false;
Mat labels_host = labels.getMat(ACCESS_READ);
unsigned char *vec = labels_host.ptr();
for (int i = 0; i < wins_per_img.area(); i++)
{
int y = i / wins_per_img.width;
int x = i - wins_per_img.width * y;
if (vec[i])
{
hits.push_back(Point(x * win_stride.width, y * win_stride.height));
}
}
return true;
}
static bool ocl_detectMultiScale(InputArray _img, std::vector<Rect> &found_locations, std::vector<double>& level_scale,
double hit_threshold, Size win_stride, double group_threshold,
const UMat& oclSvmDetector, Size blockSize, Size cellSize,
int nbins, Size blockStride, Size winSize, bool gammaCorrection,
double L2HysThreshold, float sigma, float free_coef)
{
std::vector<Rect> all_candidates;
std::vector<Point> locations;
UMat image_scale;
Size imgSize = _img.size();
image_scale.create(imgSize, _img.type());
for (size_t i = 0; i<level_scale.size() ; i++)
{
double scale = level_scale[i];
Size effect_size = Size(cvRound(imgSize.width / scale), cvRound(imgSize.height / scale));
if (effect_size == imgSize)
{
if(!ocl_detect(_img, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
return false;
}
else
{
resize(_img, image_scale, effect_size);
if(!ocl_detect(image_scale, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
return false;
}
Size scaled_win_size(cvRound(winSize.width * scale),
cvRound(winSize.height * scale));
for (size_t j = 0; j < locations.size(); j++)
all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
}
found_locations.assign(all_candidates.begin(), all_candidates.end());
cv::groupRectangles(found_locations, (int)group_threshold, 0.2);
return true;
}
#endif //HAVE_OPENCL
void HOGDescriptor::detectMultiScale(
const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
InputArray _img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
double hitThreshold, Size winStride, Size padding,
double scale0, double finalThreshold, bool useMeanshiftGrouping) const
{
double scale = 1.;
int levels = 0;
Size imgSize = _img.size();
std::vector<double> levelScale;
for( levels = 0; levels < nlevels; levels++ )
{
levelScale.push_back(scale);
if( cvRound(img.cols/scale) < winSize.width ||
cvRound(img.rows/scale) < winSize.height ||
if( cvRound(imgSize.width/scale) < winSize.width ||
cvRound(imgSize.height/scale) < winSize.height ||
scale0 <= 1 )
break;
scale *= scale0;
@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale(
levels = std::max(levels, 1);
levelScale.resize(levels);
if(winStride == Size())
winStride = blockStride;
CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && scale0 > 1 && winStride.width % blockStride.width == 0 &&
winStride.height % blockStride.height == 0 && padding == Size(0,0) && _img.isUMat(),
ocl_detectMultiScale(_img, foundLocations, levelScale, hitThreshold, winStride, finalThreshold, oclSvmDetector,
blockSize, cellSize, nbins, blockStride, winSize, gammaCorrection, L2HysThreshold, (float)getWinSigma(), free_coef));
std::vector<Rect> allCandidates;
std::vector<double> tempScales;
std::vector<double> tempWeights;
std::vector<double> foundScales;
Mutex mtx;
Mutex mtx;
Mat img = _img.getMat();
Range range(0, (int)levelScale.size());
HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales);
parallel_for_(range, invoker);
@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale(
groupRectangles(foundLocations, foundWeights, (int)finalThreshold, 0.2);
}
void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations,
void HOGDescriptor::detectMultiScale(InputArray img, std::vector<Rect>& foundLocations,
double hitThreshold, Size winStride, Size padding,
double scale0, double finalThreshold, bool useMeanshiftGrouping) const
{

@ -0,0 +1,726 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wenju He, wenju@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
#define NTHREADS 256
#define CV_PI_F 3.1415926535897932384626433832795f
#ifdef INTEL_DEVICE
#define QANGLE_TYPE int
#define QANGLE_TYPE2 int2
#else
#define QANGLE_TYPE uchar
#define QANGLE_TYPE2 uchar2
#endif
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
// Use pre-computed gaussian and interp_weight lookup tables
__kernel void compute_hists_lut_kernel(
const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int blocks_in_group, const int blocks_total,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const QANGLE_TYPE* qangle,
__global const float* gauss_w_lut,
__global float* block_hists, __local float* smem)
{
const int lx = get_local_id(0);
const int lp = lx / 24; /* local group id */
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
const int gidY = gid / img_block_width;
const int gidX = gid - gidY * img_block_width;
const int lidX = lx - lp * 24;
const int lidY = get_local_id(1);
const int cell_x = lidX / 12;
const int cell_y = lidY;
const int cell_thread_x = lidX - cell_x * 12;
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
__local float* final_hist = hists + cnbins *
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
__global const float* grad_ptr = (gid < blocks_total) ?
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
__global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48] = 0.f;
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
const int dist_y_begin = -4 - 4 * lidY;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
float gaussian = gauss_w_lut[idx];
idx = (dist_y + 8) * 16 + (dist_x + 8);
float interp_weight = gauss_w_lut[256+idx];
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if (cell_thread_x < 6)
hist_[0] += hist_[6];
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if ((tid < cblock_hist_size) && (gid < blocks_total))
{
__global float* block_hist = block_hists +
(gidY * img_block_width + gidX) * cblock_hist_size;
block_hist[tid] = final_hist[tid];
}
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
// optimized for the case of 9 bins
__kernel void normalize_hists_36_kernel(__global float* block_hists,
const float threshold, __local float *squares)
{
const int tid = get_local_id(0);
const int gid = get_global_id(0);
const int bid = tid / 36; /* block-hist id, (0 - 6) */
const int boffset = bid * 36; /* block-hist offset in the work-group */
const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
float elem = block_hists[gid];
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
__local float* smem = squares + boffset;
float sum = smem[hid];
if (hid < 18)
smem[hid] = sum = sum + smem[hid + 18];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 9)
smem[hid] = sum = sum + smem[hid + 9];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 4)
smem[hid] = sum + smem[hid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
elem = elem / (sqrt(sum) + 3.6f);
elem = min(elem, threshold);
barrier(CLK_LOCAL_MEM_FENCE);
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[hid];
if (hid < 18)
smem[hid] = sum = sum + smem[hid + 18];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 9)
smem[hid] = sum = sum + smem[hid + 9];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 4)
smem[hid] = sum + smem[hid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
//
inline float reduce_smem(volatile __local float* smem, int size)
{
unsigned int tid = get_local_id(0);
float sum = smem[tid];
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE); }
#ifdef CPU
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE); }
#else
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
}
#endif
return sum;
}
__kernel void normalize_hists_kernel(
const int nthreads, const int block_hist_size, const int img_block_width,
__global float* block_hists, const float threshold, __local float *squares)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global float* hist = block_hists + (gidY * img_block_width + gidX) *
block_hist_size + tid;
float elem = 0.f;
if (tid < block_hist_size)
elem = hist[0];
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
float sum = reduce_smem(squares, nthreads);
float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
elem = min(elem * scale, threshold);
barrier(CLK_LOCAL_MEM_FENCE);
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
sum = reduce_smem(squares, nthreads);
scale = 1.0f / (sqrt(sum) + 1e-3f);
if (tid < block_hist_size)
hist[0] = elem * scale;
}
//---------------------------------------------------------------------
// Linear SVM based classification
// 48x96 window, 9 bins and default parameters
// 180 threads, each thread corresponds to a bin in a row
__kernel void classify_hists_180_kernel(
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
for (int i = 0; i < cdescr_height; i++)
{
product += coefs[i * cdescr_width + tid] *
hist[i * img_block_width * cblock_hist_size + tid];
}
__local float products[180];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 90) products[tid] = product = product + products[tid + 90];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 45) products[tid] = product = product + products[tid + 45];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 13)
{
smem[tid] = product = product + smem[tid + 32];
}
#if WAVE_SIZE < 32
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 16)
{
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
//---------------------------------------------------------------------
// Linear SVM based classification
// 64x128 window, 9 bins and default parameters
// 256 threads, 252 of them are used
__kernel void classify_hists_252_kernel(
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
if (tid < cdescr_width)
{
for (int i = 0; i < cdescr_height; i++)
product += coefs[i * cdescr_width + tid] *
hist[i * img_block_width * cblock_hist_size + tid];
}
__local float products[NTHREADS];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
//---------------------------------------------------------------------
// Linear SVM based classification
// 256 threads
__kernel void classify_hists_kernel(
const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
for (int i = tid; i < cdescr_size; i += NTHREADS)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] *
hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__local float products[NTHREADS];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
smem[tid] = product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
//----------------------------------------------------------------------------
// Extract descriptors
__kernel void extract_descrs_by_rows_kernel(
const int cblock_hist_size, const int descriptors_quadstep,
const int cdescr_size, const int cdescr_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
__global float* descriptor = descriptors +
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
}
__kernel void extract_descrs_by_cols_kernel(
const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
__global float* descriptor = descriptors +
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
{
int block_idx = i / cblock_hist_size;
int idx_in_block = i - block_idx * cblock_hist_size;
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
//----------------------------------------------------------------------------
// Gradients computation
__kernel void compute_gradients_8UC4_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
const int tid = get_local_id(0);
const int gSizeX = get_local_size(0);
const int gidY = get_group_id(1);
__global const uchar4* row = img + gidY * img_step;
__local float sh_row[(NTHREADS + 2) * 3];
uchar4 val;
if (x < width)
val = row[x];
else
val = row[width - 2];
sh_row[tid + 1] = val.x;
sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
if (tid == 0)
{
val = row[max(x - 1, 1)];
sh_row[0] = val.x;
sh_row[(NTHREADS + 2)] = val.y;
sh_row[2 * (NTHREADS + 2)] = val.z;
}
if (tid == gSizeX - 1)
{
val = row[min(x + 1, width - 2)];
sh_row[gSizeX + 1] = val.x;
sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < width)
{
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
sh_row[tid + 2 * (NTHREADS + 2)]);
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
float3 dx;
if (correct_gamma == 1)
dx = sqrt(b) - sqrt(a);
else
dx = b - a;
float3 dy = (float3) 0.f;
if (gidY > 0 && gidY < height - 1)
{
a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
if (correct_gamma == 1)
dy = sqrt(b) - sqrt(a);
else
dy = b - a;
}
float best_dx = dx.x;
float best_dy = dy.x;
float mag0 = dx.x * dx.x + dy.x * dy.x;
float mag1 = dx.y * dx.y + dy.y * dy.y;
if (mag0 < mag1)
{
best_dx = dx.y;
best_dy = dy.y;
mag0 = mag1;
}
mag1 = dx.z * dx.z + dy.z * dy.z;
if (mag0 < mag1)
{
best_dx = dx.z;
best_dy = dy.z;
mag0 = mag1;
}
mag0 = sqrt(mag0);
float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floor(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
qangle[(gidY * qangle_step + x) << 1] = hidx;
qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
}
}
__kernel void compute_gradients_8UC1_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
const int tid = get_local_id(0);
const int gSizeX = get_local_size(0);
const int gidY = get_group_id(1);
__global const uchar* row = img + gidY * img_step;
__local float sh_row[NTHREADS + 2];
if (x < width)
sh_row[tid + 1] = row[x];
else
sh_row[tid + 1] = row[width - 2];
if (tid == 0)
sh_row[0] = row[max(x - 1, 1)];
if (tid == gSizeX - 1)
sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
barrier(CLK_LOCAL_MEM_FENCE);
if (x < width)
{
float dx;
if (correct_gamma == 1)
dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
else
dx = sh_row[tid + 2] - sh_row[tid];
float dy = 0.f;
if (gidY > 0 && gidY < height - 1)
{
float a = (float) img[ (gidY + 1) * img_step + x ];
float b = (float) img[ (gidY - 1) * img_step + x ];
if (correct_gamma == 1)
dy = sqrt(a) - sqrt(b);
else
dy = a - b;
}
float mag = sqrt(dx * dx + dy * dy);
float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floor(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
qangle[ (gidY * qangle_step + x) << 1 ] = hidx;
qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
}
}

@ -0,0 +1,121 @@
///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Shengen Yan, yanshengen@gmail.com
// Jiang Liyuan,jlyuan001.good@163.com
// Rock Li, Rock.Li@amd.com
// Zailong Wu, bullet@yeah.net
// Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#include "opencv2/ts/ocl_test.hpp"
#ifdef HAVE_OPENCL
namespace cvtest {
namespace ocl {
///////////////////// HOG /////////////////////////////
PARAM_TEST_CASE(HOG, Size, MatType)
{
Size winSize;
int type;
Mat img;
UMat uimg;
virtual void SetUp()
{
winSize = GET_PARAM(0);
type = GET_PARAM(1);
img = readImage("cascadeandhog/images/image_00000000_0.png", IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
img.copyTo(uimg);
}
};
OCL_TEST_P(HOG, GetDescriptors)
{
HOGDescriptor hog;
hog.gammaCorrection = true;
hog.setSVMDetector(hog.getDefaultPeopleDetector());
std::vector<float> cpu_descriptors;
std::vector<float> gpu_descriptors;
OCL_OFF(hog.compute(img, cpu_descriptors, hog.winSize));
OCL_ON(hog.compute(uimg, gpu_descriptors, hog.winSize));
Mat cpu_desc(cpu_descriptors), gpu_desc(gpu_descriptors);
EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
}
OCL_TEST_P(HOG, Detect)
{
HOGDescriptor hog;
hog.winSize = winSize;
hog.gammaCorrection = true;
if (winSize.width == 48 && winSize.height == 96)
hog.setSVMDetector(hog.getDaimlerPeopleDetector());
else
hog.setSVMDetector(hog.getDefaultPeopleDetector());
std::vector<Rect> cpu_found;
std::vector<Rect> gpu_found;
OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
}
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
testing::Values(Size(64, 128), Size(48, 96)),
testing::Values( MatType(CV_8UC1) ) ) );
}}
#endif
Loading…
Cancel
Save