mirror of https://github.com/opencv/opencv.git
parent
a2df490914
commit
64e9cf5d75
6 changed files with 4597 additions and 0 deletions
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,450 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Wenju He, wenju@multicorewareinc.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
|
||||
#define CELL_WIDTH 8 |
||||
#define CELL_HEIGHT 8 |
||||
#define CELLS_PER_BLOCK_X 2 |
||||
#define CELLS_PER_BLOCK_Y 2 |
||||
#define NTHREADS 256 |
||||
#define CV_PI_F 3.1415926535897932384626433832795f |
||||
|
||||
//---------------------------------------------------------------------------- |
||||
// Histogram computation |
||||
|
||||
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, |
||||
const int cnbins, const int cblock_hist_size, const int img_block_width, |
||||
const int grad_quadstep, const int qangle_step, |
||||
__global const float* grad, __global const uchar* qangle, |
||||
const float scale, __global float* block_hists, __local float* smem) |
||||
{ |
||||
const int lidX = get_local_id(0); |
||||
const int lidY = get_local_id(1); |
||||
const int gidX = get_group_id(0); |
||||
const int gidY = get_group_id(1); |
||||
|
||||
const int cell_x = lidX / 16; |
||||
const int cell_y = lidY; |
||||
const int cell_thread_x = lidX & 0xF; |
||||
|
||||
__local float* hists = smem; |
||||
__local float* final_hist = smem + cnbins * 48; |
||||
|
||||
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; |
||||
const int offset_y = gidY * cblock_stride_y + (cell_y << 2); |
||||
|
||||
__global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1); |
||||
__global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1); |
||||
|
||||
// 12 means that 12 pixels affect on block's cell (in one row) |
||||
if (cell_thread_x < 12) |
||||
{ |
||||
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x; |
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id) |
||||
hist[bin_id * 48] = 0.f; |
||||
|
||||
const int dist_x = -4 + cell_thread_x - 4 * cell_x; |
||||
|
||||
const int dist_y_begin = -4 - 4 * lidY; |
||||
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) |
||||
{ |
||||
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); |
||||
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); |
||||
|
||||
grad_ptr += grad_quadstep; |
||||
qangle_ptr += qangle_step; |
||||
|
||||
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); |
||||
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); |
||||
|
||||
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale); |
||||
float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f; |
||||
|
||||
hist[bin.x * 48] += gaussian * interp_weight * vote.x; |
||||
hist[bin.y * 48] += gaussian * interp_weight * vote.y; |
||||
} |
||||
|
||||
volatile __local float* hist_ = hist; |
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) |
||||
{ |
||||
if (cell_thread_x < 6) hist_[0] += hist_[6]; |
||||
if (cell_thread_x < 3) hist_[0] += hist_[3]; |
||||
if (cell_thread_x == 0) |
||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
__global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size; |
||||
|
||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; |
||||
if (tid < cblock_hist_size) |
||||
block_hist[tid] = final_hist[tid]; |
||||
} |
||||
|
||||
//------------------------------------------------------------- |
||||
// Normalization of histograms via L2Hys_norm |
||||
// |
||||
float reduce_smem(volatile __local float* smem, int size) |
||||
{ |
||||
unsigned int tid = get_local_id(0); |
||||
float sum = smem[tid]; |
||||
|
||||
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); } |
||||
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); } |
||||
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; |
||||
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; |
||||
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; |
||||
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; |
||||
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; |
||||
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
sum = smem[0]; |
||||
|
||||
return sum; |
||||
} |
||||
|
||||
__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width, |
||||
__global float* block_hists, const float threshold, __local float *squares) |
||||
{ |
||||
const int tid = get_local_id(0); |
||||
const int gidX = get_group_id(0); |
||||
const int gidY = get_group_id(1); |
||||
|
||||
__global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid; |
||||
|
||||
float elem = 0.f; |
||||
if (tid < block_hist_size) |
||||
elem = hist[0]; |
||||
|
||||
squares[tid] = elem * elem; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
float sum = reduce_smem(squares, nthreads); |
||||
|
||||
float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size); |
||||
elem = min(elem * scale, threshold); |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
squares[tid] = elem * elem; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
sum = reduce_smem(squares, nthreads); |
||||
scale = 1.0f / (sqrt(sum) + 1e-3f); |
||||
|
||||
if (tid < block_hist_size) |
||||
hist[0] = elem * scale; |
||||
} |
||||
|
||||
//--------------------------------------------------------------------- |
||||
// Linear SVM based classification |
||||
// |
||||
__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width, |
||||
const int img_win_width, const int img_block_width, |
||||
const int win_block_stride_x, const int win_block_stride_y, |
||||
__global const float * block_hists, __global const float* coefs, |
||||
float free_coef, float threshold, __global uchar* labels) |
||||
{ |
||||
const int tid = get_local_id(0); |
||||
const int gidX = get_group_id(0); |
||||
const int gidY = get_group_id(1); |
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; |
||||
|
||||
float product = 0.f; |
||||
for (int i = tid; i < cdescr_size; i += NTHREADS) |
||||
{ |
||||
int offset_y = i / cdescr_width; |
||||
int offset_x = i - offset_y * cdescr_width; |
||||
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x]; |
||||
} |
||||
|
||||
__local float products[NTHREADS]; |
||||
|
||||
products[tid] = product; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 128) products[tid] = product = product + products[tid + 128]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 64) products[tid] = product = product + products[tid + 64]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
volatile __local float* smem = products; |
||||
smem[tid] = product = product + smem[tid + 32]; |
||||
smem[tid] = product = product + smem[tid + 16]; |
||||
smem[tid] = product = product + smem[tid + 8]; |
||||
smem[tid] = product = product + smem[tid + 4]; |
||||
smem[tid] = product = product + smem[tid + 2]; |
||||
smem[tid] = product = product + smem[tid + 1]; |
||||
} |
||||
|
||||
if (tid == 0) |
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); |
||||
} |
||||
|
||||
//---------------------------------------------------------------------------- |
||||
// Extract descriptors |
||||
|
||||
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, |
||||
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, |
||||
__global const float* block_hists, __global float* descriptors) |
||||
{ |
||||
int tid = get_local_id(0); |
||||
int gidX = get_group_id(0); |
||||
int gidY = get_group_id(1); |
||||
|
||||
// Get left top corner of the window in src |
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; |
||||
|
||||
// Get left top corner of the window in dst |
||||
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; |
||||
|
||||
// Copy elements from src to dst |
||||
for (int i = tid; i < cdescr_size; i += NTHREADS) |
||||
{ |
||||
int offset_y = i / cdescr_width; |
||||
int offset_x = i - offset_y * cdescr_width; |
||||
descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x]; |
||||
} |
||||
} |
||||
|
||||
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, |
||||
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, |
||||
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) |
||||
{ |
||||
int tid = get_local_id(0); |
||||
int gidX = get_group_id(0); |
||||
int gidY = get_group_id(1); |
||||
|
||||
// Get left top corner of the window in src |
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; |
||||
|
||||
// Get left top corner of the window in dst |
||||
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; |
||||
|
||||
// Copy elements from src to dst |
||||
for (int i = tid; i < cdescr_size; i += NTHREADS) |
||||
{ |
||||
int block_idx = i / cblock_hist_size; |
||||
int idx_in_block = i - block_idx * cblock_hist_size; |
||||
|
||||
int y = block_idx / cnblocks_win_x; |
||||
int x = block_idx - y * cnblocks_win_x; |
||||
|
||||
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block]; |
||||
} |
||||
} |
||||
|
||||
//---------------------------------------------------------------------------- |
||||
// Gradients computation |
||||
|
||||
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, |
||||
const __global uchar4 * img, __global float * grad, __global uchar * qangle, |
||||
const float angle_scale, const char correct_gamma, const int cnbins) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int tid = get_local_id(0); |
||||
const int gSizeX = get_local_size(0); |
||||
const int gidX = get_group_id(0); |
||||
const int gidY = get_group_id(1); |
||||
|
||||
__global const uchar4* row = img + gidY * img_step; |
||||
|
||||
__local float sh_row[(NTHREADS + 2) * 3]; |
||||
|
||||
uchar4 val; |
||||
if (x < width) |
||||
val = row[x]; |
||||
else |
||||
val = row[width - 2]; |
||||
|
||||
sh_row[tid + 1] = val.x; |
||||
sh_row[tid + 1 + (NTHREADS + 2)] = val.y; |
||||
sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z; |
||||
|
||||
if (tid == 0) |
||||
{ |
||||
val = row[max(x - 1, 1)]; |
||||
sh_row[0] = val.x; |
||||
sh_row[(NTHREADS + 2)] = val.y; |
||||
sh_row[2 * (NTHREADS + 2)] = val.z; |
||||
} |
||||
|
||||
if (tid == gSizeX - 1) |
||||
{ |
||||
val = row[min(x + 1, width - 2)]; |
||||
sh_row[gSizeX + 1] = val.x; |
||||
sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y; |
||||
sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if (x < width) |
||||
{ |
||||
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]); |
||||
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]); |
||||
|
||||
float3 dx; |
||||
if (correct_gamma == 1) |
||||
dx = sqrt(b) - sqrt(a); |
||||
else |
||||
dx = b - a; |
||||
|
||||
float3 dy = (float3) 0.f; |
||||
|
||||
if (gidY > 0 && gidY < height - 1) |
||||
{ |
||||
a = convert_float3(img[(gidY - 1) * img_step + x].xyz); |
||||
b = convert_float3(img[(gidY + 1) * img_step + x].xyz); |
||||
|
||||
if (correct_gamma == 1) |
||||
dy = sqrt(b) - sqrt(a); |
||||
else |
||||
dy = b - a; |
||||
} |
||||
|
||||
float best_dx = dx.x; |
||||
float best_dy = dy.x; |
||||
|
||||
float mag0 = dx.x * dx.x + dy.x * dy.x; |
||||
float mag1 = dx.y * dx.y + dy.y * dy.y; |
||||
if (mag0 < mag1) |
||||
{ |
||||
best_dx = dx.y; |
||||
best_dy = dy.y; |
||||
mag0 = mag1; |
||||
} |
||||
|
||||
mag1 = dx.z * dx.z + dy.z * dy.z; |
||||
if (mag0 < mag1) |
||||
{ |
||||
best_dx = dx.z; |
||||
best_dy = dy.z; |
||||
mag0 = mag1; |
||||
} |
||||
|
||||
mag0 = sqrt(mag0); |
||||
|
||||
float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f; |
||||
int hidx = (int)floor(ang); |
||||
ang -= hidx; |
||||
hidx = (hidx + cnbins) % cnbins; |
||||
|
||||
qangle[(gidY * qangle_step + x) << 1] = hidx; |
||||
qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins; |
||||
grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang); |
||||
grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang; |
||||
} |
||||
} |
||||
|
||||
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, |
||||
__global const uchar * img, __global float * grad, __global uchar * qangle, |
||||
const float angle_scale, const char correct_gamma, const int cnbins) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int tid = get_local_id(0); |
||||
const int gSizeX = get_local_size(0); |
||||
const int gidX = get_group_id(0); |
||||
const int gidY = get_group_id(1); |
||||
|
||||
__global const uchar* row = img + gidY * img_step; |
||||
|
||||
__local float sh_row[NTHREADS + 2]; |
||||
|
||||
if (x < width) |
||||
sh_row[tid + 1] = row[x]; |
||||
else |
||||
sh_row[tid + 1] = row[width - 2]; |
||||
|
||||
if (tid == 0) |
||||
sh_row[0] = row[max(x - 1, 1)]; |
||||
|
||||
if (tid == gSizeX - 1) |
||||
sh_row[gSizeX + 1] = row[min(x + 1, width - 2)]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if (x < width) |
||||
{ |
||||
float dx; |
||||
|
||||
if (correct_gamma == 1) |
||||
dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]); |
||||
else |
||||
dx = sh_row[tid + 2] - sh_row[tid]; |
||||
|
||||
float dy = 0.f; |
||||
if (gidY > 0 && gidY < height - 1) |
||||
{ |
||||
float a = (float) img[ (gidY + 1) * img_step + x ]; |
||||
float b = (float) img[ (gidY - 1) * img_step + x ]; |
||||
if (correct_gamma == 1) |
||||
dy = sqrt(a) - sqrt(b); |
||||
else |
||||
dy = a - b; |
||||
} |
||||
float mag = sqrt(dx * dx + dy * dy); |
||||
|
||||
float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f; |
||||
int hidx = (int)floor(ang); |
||||
ang -= hidx; |
||||
hidx = (hidx + cnbins) % cnbins; |
||||
|
||||
qangle[ (gidY * qangle_step + x) << 1 ] = hidx; |
||||
qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins; |
||||
grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang); |
||||
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang; |
||||
} |
||||
} |
@ -0,0 +1,760 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#include <iomanip> |
||||
#include "precomp.hpp" |
||||
|
||||
|
||||
using namespace cv; |
||||
using namespace cv::ocl; |
||||
using namespace std; |
||||
|
||||
#if !defined (HAVE_OPENCL) |
||||
|
||||
cv::ocl::SURF_OCL::SURF_OCL() { throw_nogpu(); } |
||||
cv::ocl::SURF_OCL::SURF_OCL(double, int, int, bool, float, bool) { throw_nogpu(); } |
||||
int cv::ocl::SURF_OCL::descriptorSize() const { throw_nogpu(); return 0;} |
||||
void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>&, oclMat&) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat&, vector<KeyPoint>&) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat&, vector<float>&) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&, oclMat&, bool) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, oclMat&, bool) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); } |
||||
void cv::ocl::SURF_OCL::releaseMemory() { throw_nogpu(); } |
||||
|
||||
#else /* !defined (HAVE_OPENCL) */ |
||||
namespace cv { namespace ocl
|
||||
{ |
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
extern const char * nonfree_surf; |
||||
}} |
||||
|
||||
namespace
|
||||
{ |
||||
static inline int divUp(int total, int grain) |
||||
{ |
||||
return (total + grain - 1) / grain; |
||||
} |
||||
static inline int calcSize(int octave, int layer) |
||||
{ |
||||
/* Wavelet size at first layer of first octave. */ |
||||
const int HAAR_SIZE0 = 9; |
||||
|
||||
/* Wavelet size increment between layers. This should be an even number,
|
||||
such that the wavelet sizes in an octave are either all even or all odd. |
||||
This ensures that when looking for the neighbours of a sample, the layers |
||||
|
||||
above and below are aligned correctly. */ |
||||
const int HAAR_SIZE_INC = 6; |
||||
|
||||
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; |
||||
} |
||||
|
||||
class SURF_OCL_Invoker |
||||
{ |
||||
public: |
||||
// facilities
|
||||
void bindImgTex(const oclMat& img); |
||||
void bindSumTex(const oclMat& sum); |
||||
void bindMaskSumTex(const oclMat& maskSum); |
||||
|
||||
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
|
||||
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
|
||||
|
||||
// kernel callers declearations
|
||||
void icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int layer_rows); |
||||
|
||||
void icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset, |
||||
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols); |
||||
|
||||
void icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter, |
||||
oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures); |
||||
|
||||
void icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures); |
||||
|
||||
void compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures); |
||||
// end of kernel callers declearations
|
||||
|
||||
|
||||
SURF_OCL_Invoker(SURF_OCL& surf, const oclMat& img, const oclMat& mask) : |
||||
surf_(surf), |
||||
img_cols(img.cols), img_rows(img.rows), |
||||
use_mask(!mask.empty()) |
||||
{ |
||||
CV_Assert(!img.empty() && img.type() == CV_8UC1); |
||||
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); |
||||
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0); |
||||
|
||||
const int min_size = calcSize(surf_.nOctaves - 1, 0); |
||||
CV_Assert(img_rows - min_size >= 0); |
||||
CV_Assert(img_cols - min_size >= 0); |
||||
|
||||
const int layer_rows = img_rows >> (surf_.nOctaves - 1); |
||||
const int layer_cols = img_cols >> (surf_.nOctaves - 1); |
||||
const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1; |
||||
CV_Assert(layer_rows - 2 * min_margin > 0); |
||||
CV_Assert(layer_cols - 2 * min_margin > 0); |
||||
|
||||
maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535); |
||||
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535); |
||||
|
||||
CV_Assert(maxFeatures > 0); |
||||
|
||||
counters.create(1, surf_.nOctaves + 1, CV_32SC1); |
||||
counters.setTo(Scalar::all(0)); |
||||
|
||||
//loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
|
||||
|
||||
bindImgTex(img); |
||||
oclMat integral_sqsum; |
||||
integral(img, surf_.sum, integral_sqsum); // the two argumented integral version is incorrect
|
||||
|
||||
bindSumTex(surf_.sum); |
||||
maskSumTex = 0; |
||||
|
||||
if (use_mask) |
||||
{ |
||||
throw std::exception(); |
||||
//!FIXME
|
||||
// temp fix for missing min overload
|
||||
oclMat temp(mask.size(), mask.type()); |
||||
temp.setTo(Scalar::all(1.0)); |
||||
//cv::ocl::min(mask, temp, surf_.mask1); ///////// disable this
|
||||
integral(surf_.mask1, surf_.maskSum); |
||||
bindMaskSumTex(surf_.maskSum); |
||||
} |
||||
} |
||||
|
||||
void detectKeypoints(oclMat& keypoints) |
||||
{ |
||||
// create image pyramid buffers
|
||||
// different layers have same sized buffers, but they are sampled from gaussin kernel.
|
||||
surf_.det.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1);
|
||||
surf_.trace.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1); |
||||
|
||||
surf_.maxPosBuffer.create(1, maxCandidates, CV_32SC4); |
||||
keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1); |
||||
keypoints.setTo(Scalar::all(0)); |
||||
|
||||
for (int octave = 0; octave < surf_.nOctaves; ++octave) |
||||
{ |
||||
const int layer_rows = img_rows >> octave; |
||||
const int layer_cols = img_cols >> octave; |
||||
|
||||
//loadOctaveConstants(octave, layer_rows, layer_cols);
|
||||
|
||||
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows); |
||||
|
||||
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave, |
||||
octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols); |
||||
|
||||
unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave); |
||||
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates)); |
||||
|
||||
if (maxCounter > 0) |
||||
{ |
||||
icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter, |
||||
keypoints, counters, octave, layer_rows, maxFeatures); |
||||
} |
||||
} |
||||
unsigned int featureCounter = Mat(counters).at<unsigned int>(0); |
||||
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures)); |
||||
|
||||
keypoints.cols = featureCounter; |
||||
|
||||
if (surf_.upright) |
||||
keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); |
||||
else |
||||
findOrientation(keypoints); |
||||
} |
||||
|
||||
void findOrientation(oclMat& keypoints) |
||||
{ |
||||
const int nFeatures = keypoints.cols; |
||||
if (nFeatures > 0) |
||||
{ |
||||
icvCalcOrientation_gpu(keypoints, nFeatures); |
||||
} |
||||
} |
||||
|
||||
void computeDescriptors(const oclMat& keypoints, oclMat& descriptors, int descriptorSize) |
||||
{ |
||||
const int nFeatures = keypoints.cols; |
||||
if (nFeatures > 0) |
||||
{ |
||||
descriptors.create(nFeatures, descriptorSize, CV_32F); |
||||
compute_descriptors_gpu(descriptors, keypoints, nFeatures); |
||||
} |
||||
} |
||||
|
||||
~SURF_OCL_Invoker() |
||||
{ |
||||
if(imgTex) |
||||
openCLFree(imgTex); |
||||
if(sumTex) |
||||
openCLFree(sumTex); |
||||
if(maskSumTex) |
||||
openCLFree(maskSumTex); |
||||
additioalParamBuffer.release(); |
||||
} |
||||
|
||||
private: |
||||
SURF_OCL& surf_; |
||||
|
||||
int img_cols, img_rows; |
||||
|
||||
bool use_mask; |
||||
|
||||
int maxCandidates; |
||||
int maxFeatures; |
||||
|
||||
oclMat counters; |
||||
|
||||
// texture buffers
|
||||
cl_mem imgTex; |
||||
cl_mem sumTex; |
||||
cl_mem maskSumTex; |
||||
|
||||
oclMat additioalParamBuffer; |
||||
}; |
||||
} |
||||
|
||||
cv::ocl::SURF_OCL::SURF_OCL() |
||||
{ |
||||
hessianThreshold = 100.0f; |
||||
extended = true; |
||||
nOctaves = 4; |
||||
nOctaveLayers = 2; |
||||
keypointsRatio = 0.01f; |
||||
upright = false; |
||||
} |
||||
|
||||
cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright) |
||||
{ |
||||
hessianThreshold = _threshold; |
||||
extended = _extended; |
||||
nOctaves = _nOctaves; |
||||
nOctaveLayers = _nOctaveLayers; |
||||
keypointsRatio = _keypointsRatio; |
||||
upright = _upright; |
||||
} |
||||
|
||||
int cv::ocl::SURF_OCL::descriptorSize() const |
||||
{ |
||||
return extended ? 128 : 64; |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>& keypoints, oclMat& keypointsGPU) |
||||
{ |
||||
if (keypoints.empty()) |
||||
keypointsGPU.release(); |
||||
else |
||||
{ |
||||
Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1); |
||||
|
||||
float* kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW); |
||||
float* kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW); |
||||
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW); |
||||
int* kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW); |
||||
float* kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW); |
||||
float* kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW); |
||||
float* kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW); |
||||
|
||||
for (size_t i = 0, size = keypoints.size(); i < size; ++i) |
||||
{ |
||||
const KeyPoint& kp = keypoints[i]; |
||||
kp_x[i] = kp.pt.x; |
||||
kp_y[i] = kp.pt.y; |
||||
kp_octave[i] = kp.octave; |
||||
kp_size[i] = kp.size; |
||||
kp_dir[i] = kp.angle; |
||||
kp_hessian[i] = kp.response; |
||||
kp_laplacian[i] = 1; |
||||
} |
||||
|
||||
keypointsGPU.upload(keypointsCPU); |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat& keypointsGPU, vector<KeyPoint>& keypoints) |
||||
{ |
||||
const int nFeatures = keypointsGPU.cols; |
||||
|
||||
if (nFeatures == 0) |
||||
keypoints.clear(); |
||||
else |
||||
{ |
||||
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT); |
||||
|
||||
Mat keypointsCPU(keypointsGPU); |
||||
|
||||
keypoints.resize(nFeatures); |
||||
|
||||
float* kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW); |
||||
float* kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW); |
||||
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW); |
||||
int* kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW); |
||||
float* kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW); |
||||
float* kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW); |
||||
float* kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW); |
||||
|
||||
for (int i = 0; i < nFeatures; ++i) |
||||
{ |
||||
KeyPoint& kp = keypoints[i]; |
||||
kp.pt.x = kp_x[i]; |
||||
kp.pt.y = kp_y[i]; |
||||
kp.class_id = kp_laplacian[i]; |
||||
kp.octave = kp_octave[i]; |
||||
kp.size = kp_size[i]; |
||||
kp.angle = kp_dir[i]; |
||||
kp.response = kp_hessian[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat& descriptorsGPU, vector<float>& descriptors) |
||||
{ |
||||
if (descriptorsGPU.empty()) |
||||
descriptors.clear(); |
||||
else |
||||
{ |
||||
CV_Assert(descriptorsGPU.type() == CV_32F); |
||||
|
||||
descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols); |
||||
Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]); |
||||
descriptorsGPU.download(descriptorsCPU); |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints) |
||||
{ |
||||
if (!img.empty()) |
||||
{ |
||||
SURF_OCL_Invoker surf(*this, img, mask); |
||||
|
||||
surf.detectKeypoints(keypoints); |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors, |
||||
bool useProvidedKeypoints) |
||||
{ |
||||
if (!img.empty()) |
||||
{ |
||||
SURF_OCL_Invoker surf(*this, img, mask); |
||||
|
||||
if (!useProvidedKeypoints) |
||||
surf.detectKeypoints(keypoints); |
||||
else if (!upright) |
||||
{ |
||||
surf.findOrientation(keypoints); |
||||
} |
||||
|
||||
surf.computeDescriptors(keypoints, descriptors, descriptorSize()); |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints) |
||||
{ |
||||
oclMat keypointsGPU; |
||||
|
||||
(*this)(img, mask, keypointsGPU); |
||||
|
||||
downloadKeypoints(keypointsGPU, keypoints); |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints, |
||||
oclMat& descriptors, bool useProvidedKeypoints) |
||||
{ |
||||
oclMat keypointsGPU; |
||||
|
||||
if (useProvidedKeypoints) |
||||
uploadKeypoints(keypoints, keypointsGPU); |
||||
|
||||
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints); |
||||
|
||||
downloadKeypoints(keypointsGPU, keypoints); |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints, |
||||
vector<float>& descriptors, bool useProvidedKeypoints) |
||||
{ |
||||
oclMat descriptorsGPU; |
||||
|
||||
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints); |
||||
|
||||
downloadDescriptors(descriptorsGPU, descriptors); |
||||
} |
||||
|
||||
void cv::ocl::SURF_OCL::releaseMemory() |
||||
{ |
||||
sum.release(); |
||||
mask1.release(); |
||||
maskSum.release(); |
||||
intBuffer.release(); |
||||
det.release(); |
||||
trace.release(); |
||||
maxPosBuffer.release(); |
||||
} |
||||
|
||||
// Facilities
|
||||
|
||||
//// load SURF constants into device memory
|
||||
//void SURF_OCL_Invoker::loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
|
||||
//{
|
||||
// Mat tmp(1, 9, CV_32FC1);
|
||||
// float * tmp_data = tmp.ptr<float>();
|
||||
// *tmp_data = maxCandidates;
|
||||
// *(++tmp_data) = maxFeatures;
|
||||
// *(++tmp_data) = img_rows;
|
||||
// *(++tmp_data) = img_cols;
|
||||
// *(++tmp_data) = nOctaveLayers;
|
||||
// *(++tmp_data) = hessianThreshold;
|
||||
// additioalParamBuffer = tmp;
|
||||
//}
|
||||
//void SURF_OCL_Invoker::loadOctaveConstants(int octave, int layer_rows, int layer_cols)
|
||||
//{
|
||||
// Mat tmp = additioalParamBuffer;
|
||||
// float * tmp_data = tmp.ptr<float>();
|
||||
// tmp_data += 6;
|
||||
// *tmp_data = octave;
|
||||
// *(++tmp_data) = layer_rows;
|
||||
// *(++tmp_data) = layer_cols;
|
||||
// additioalParamBuffer = tmp;
|
||||
//}
|
||||
|
||||
// create and bind source buffer to image oject.
|
||||
void SURF_OCL_Invoker::bindImgTex(const oclMat& img) |
||||
{ |
||||
Mat cpu_img(img); // time consuming
|
||||
cl_image_format format; |
||||
int err; |
||||
|
||||
format.image_channel_data_type = CL_UNSIGNED_INT8; |
||||
format.image_channel_order = CL_R; |
||||
|
||||
#if CL_VERSION_1_2 |
||||
cl_image_desc desc; |
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D; |
||||
desc.image_width = cpu_img.cols; |
||||
desc.image_height = cpu_img.rows; |
||||
desc.image_depth = NULL; |
||||
desc.image_array_size = 1; |
||||
desc.image_row_pitch = cpu_img.step; |
||||
desc.image_slice_pitch= 0; |
||||
desc.buffer = NULL; |
||||
desc.num_mip_levels = 0; |
||||
desc.num_samples = 0; |
||||
imgTex = clCreateImage(img.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err); |
||||
#else |
||||
imgTex = clCreateImage2D( |
||||
img.clCxt->impl->clContext,
|
||||
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
|
||||
&format,
|
||||
cpu_img.cols,
|
||||
cpu_img.rows,
|
||||
cpu_img.step,
|
||||
cpu_img.data,
|
||||
&err); |
||||
#endif |
||||
openCLSafeCall(err); |
||||
} |
||||
|
||||
void SURF_OCL_Invoker::bindSumTex(const oclMat& sum) |
||||
{ |
||||
Mat cpu_img(sum); // time consuming
|
||||
cl_image_format format; |
||||
int err; |
||||
format.image_channel_data_type = CL_UNSIGNED_INT32; |
||||
format.image_channel_order = CL_R; |
||||
#if CL_VERSION_1_2 |
||||
cl_image_desc desc; |
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D; |
||||
desc.image_width = cpu_img.cols; |
||||
desc.image_height = cpu_img.rows; |
||||
desc.image_depth = NULL; |
||||
desc.image_array_size = 1; |
||||
desc.image_row_pitch = cpu_img.step; |
||||
desc.image_slice_pitch= 0; |
||||
desc.buffer = NULL; |
||||
desc.num_mip_levels = 0; |
||||
desc.num_samples = 0; |
||||
sumTex = clCreateImage(sum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err); |
||||
#else |
||||
sumTex = clCreateImage2D( |
||||
sum.clCxt->impl->clContext,
|
||||
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
|
||||
&format,
|
||||
cpu_img.cols,
|
||||
cpu_img.rows,
|
||||
cpu_img.step,
|
||||
cpu_img.data,
|
||||
&err); |
||||
#endif |
||||
openCLSafeCall(err); |
||||
} |
||||
void SURF_OCL_Invoker::bindMaskSumTex(const oclMat& maskSum) |
||||
{ |
||||
Mat cpu_img(maskSum); // time consuming
|
||||
cl_image_format format; |
||||
int err; |
||||
format.image_channel_data_type = CL_UNSIGNED_INT32; |
||||
format.image_channel_order = CL_R; |
||||
#if CL_VERSION_1_2 |
||||
cl_image_desc desc; |
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D; |
||||
desc.image_width = cpu_img.cols; |
||||
desc.image_height = cpu_img.rows; |
||||
desc.image_depth = NULL; |
||||
desc.image_array_size = 1; |
||||
desc.image_row_pitch = cpu_img.step; |
||||
desc.image_slice_pitch= 0; |
||||
desc.buffer = NULL; |
||||
desc.num_mip_levels = 0; |
||||
desc.num_samples = 0; |
||||
maskSumTex = clCreateImage(maskSum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err); |
||||
#else |
||||
maskSumTex = clCreateImage2D( |
||||
maskSum.clCxt->impl->clContext,
|
||||
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
|
||||
&format,
|
||||
cpu_img.cols,
|
||||
cpu_img.rows,
|
||||
cpu_img.step,
|
||||
cpu_img.data,
|
||||
&err); |
||||
#endif |
||||
openCLSafeCall(err); |
||||
} |
||||
|
||||
////////////////////////////
|
||||
// kernel caller definitions
|
||||
void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int c_layer_rows) |
||||
{ |
||||
const int min_size = calcSize(octave, 0); |
||||
const int max_samples_i = 1 + ((img_rows - min_size) >> octave); |
||||
const int max_samples_j = 1 + ((img_cols - min_size) >> octave); |
||||
|
||||
Context *clCxt = det.clCxt; |
||||
string kernelName = "icvCalcLayerDetAndTrace"; |
||||
vector< pair<size_t, const void *> > args; |
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&octave)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows)); |
||||
|
||||
size_t localThreads[3] = {16, 16, 1}; |
||||
size_t globalThreads[3] = { |
||||
divUp(max_samples_j, localThreads[0]) * localThreads[0],
|
||||
divUp(max_samples_i, localThreads[1]) * localThreads[1] * (nOctaveLayers + 2),
|
||||
1}; |
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
|
||||
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset, |
||||
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols) |
||||
{ |
||||
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; |
||||
|
||||
Context *clCxt = det.clCxt; |
||||
string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer"; |
||||
vector< pair<size_t, const void *> > args; |
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&octave)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates)); |
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold)); |
||||
|
||||
if(use_mask) |
||||
{ |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex)); |
||||
} |
||||
|
||||
size_t localThreads[3] = {16, 16, 1}; |
||||
size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) * localThreads[0],
|
||||
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) * nLayers * localThreads[1],
|
||||
1}; |
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
|
||||
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter, |
||||
oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures) |
||||
{ |
||||
Context *clCxt = det.clCxt; |
||||
string kernelName = "icvInterpolateKeypoint"; |
||||
vector< pair<size_t, const void *> > args; |
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&octave)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures)); |
||||
|
||||
size_t localThreads[3] = {3, 3, 3}; |
||||
size_t globalThreads[3] = {maxCounter * localThreads[0], 1, 1}; |
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
|
||||
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures) |
||||
{ |
||||
Context * clCxt = counters.clCxt; |
||||
string kernelName = "icvCalcOrientation"; |
||||
|
||||
vector< pair<size_t, const void *> > args; |
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols)); |
||||
|
||||
size_t localThreads[3] = {32, 4, 1}; |
||||
size_t globalThreads[3] = {nFeatures * localThreads[0], localThreads[1], 1}; |
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
|
||||
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures) |
||||
{ |
||||
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
|
||||
Context *clCxt = descriptors.clCxt; |
||||
string kernelName = ""; |
||||
vector< pair<size_t, const void *> > args; |
||||
size_t localThreads[3] = {1, 1, 1}; |
||||
size_t globalThreads[3] = {1, 1, 1}; |
||||
|
||||
if(descriptors.cols == 64) |
||||
{ |
||||
kernelName = "compute_descriptors64"; |
||||
|
||||
localThreads[0] = 6; |
||||
localThreads[1] = 6; |
||||
|
||||
globalThreads[0] = nFeatures * localThreads[0]; |
||||
globalThreads[1] = 16 * localThreads[1]; |
||||
|
||||
args.clear(); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); |
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
|
||||
kernelName = "normalize_descriptors64"; |
||||
|
||||
localThreads[0] = 64; |
||||
localThreads[1] = 1; |
||||
|
||||
globalThreads[0] = nFeatures * localThreads[0]; |
||||
globalThreads[1] = localThreads[1]; |
||||
|
||||
args.clear(); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); |
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
else |
||||
{ |
||||
kernelName = "compute_descriptors128"; |
||||
|
||||
localThreads[0] = 6; |
||||
localThreads[1] = 6; |
||||
|
||||
globalThreads[0] = nFeatures * localThreads[0]; |
||||
globalThreads[1] = 16 * localThreads[1]; |
||||
|
||||
args.clear(); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); |
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
|
||||
kernelName = "normalize_descriptors128"; |
||||
|
||||
localThreads[0] = 128; |
||||
localThreads[1] = 1; |
||||
|
||||
globalThreads[0] = nFeatures * localThreads[0]; |
||||
globalThreads[1] = localThreads[1]; |
||||
|
||||
args.clear(); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); |
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
} |
||||
} |
||||
|
||||
#endif // /* !defined (HAVE_OPENCL) */
|
||||
|
@ -0,0 +1,192 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Wenju He, wenju@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencv2/core/core.hpp" |
||||
using namespace std; |
||||
#ifdef HAVE_OPENCL |
||||
|
||||
|
||||
PARAM_TEST_CASE(HOG,cv::Size,int) |
||||
{ |
||||
cv::Size winSize; |
||||
int type; |
||||
vector<cv::ocl::Info> info; |
||||
virtual void SetUp() |
||||
{ |
||||
winSize = GET_PARAM(0); |
||||
type = GET_PARAM(1); |
||||
cv::ocl::getDevice(info); |
||||
} |
||||
}; |
||||
|
||||
TEST_P(HOG, GetDescriptors) |
||||
{ |
||||
// Load image
|
||||
cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); |
||||
ASSERT_FALSE(img_rgb.empty()); |
||||
|
||||
// Convert image
|
||||
cv::Mat img; |
||||
switch (type) |
||||
{ |
||||
case CV_8UC1: |
||||
cv::cvtColor(img_rgb, img, CV_BGR2GRAY); |
||||
break; |
||||
case CV_8UC4: |
||||
default: |
||||
cv::cvtColor(img_rgb, img, CV_BGR2BGRA); |
||||
break; |
||||
} |
||||
cv::ocl::oclMat d_img(img); |
||||
|
||||
// HOGs
|
||||
cv::ocl::HOGDescriptor ocl_hog; |
||||
ocl_hog.gamma_correction = true; |
||||
cv::HOGDescriptor hog; |
||||
hog.gammaCorrection = true; |
||||
|
||||
// Compute descriptor
|
||||
cv::ocl::oclMat d_descriptors; |
||||
ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL); |
||||
cv::Mat down_descriptors; |
||||
d_descriptors.download(down_descriptors); |
||||
down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows); |
||||
|
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector()); |
||||
std::vector<float> descriptors; |
||||
switch (type) |
||||
{ |
||||
case CV_8UC1: |
||||
hog.compute(img, descriptors, ocl_hog.win_size); |
||||
break; |
||||
case CV_8UC4: |
||||
default: |
||||
hog.compute(img_rgb, descriptors, ocl_hog.win_size); |
||||
break; |
||||
} |
||||
cv::Mat cpu_descriptors(descriptors); |
||||
|
||||
EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2); |
||||
} |
||||
|
||||
|
||||
TEST_P(HOG, Detect) |
||||
{ |
||||
// Load image
|
||||
cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); |
||||
ASSERT_FALSE(img_rgb.empty()); |
||||
|
||||
// Convert image
|
||||
cv::Mat img; |
||||
switch (type) |
||||
{ |
||||
case CV_8UC1: |
||||
cv::cvtColor(img_rgb, img, CV_BGR2GRAY); |
||||
break; |
||||
case CV_8UC4: |
||||
default: |
||||
cv::cvtColor(img_rgb, img, CV_BGR2BGRA); |
||||
break; |
||||
} |
||||
cv::ocl::oclMat d_img(img); |
||||
|
||||
// HOGs
|
||||
if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128))) |
||||
winSize = cv::Size(64, 128); |
||||
cv::ocl::HOGDescriptor ocl_hog(winSize); |
||||
ocl_hog.gamma_correction = true; |
||||
|
||||
cv::HOGDescriptor hog; |
||||
hog.winSize = winSize; |
||||
hog.gammaCorrection = true; |
||||
|
||||
if (winSize.width == 48 && winSize.height == 96) |
||||
{ |
||||
// daimler's base
|
||||
ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96()); |
||||
hog.setSVMDetector(hog.getDaimlerPeopleDetector()); |
||||
} |
||||
else if (winSize.width == 64 && winSize.height == 128) |
||||
{ |
||||
ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128()); |
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector()); |
||||
} |
||||
else |
||||
{ |
||||
ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector()); |
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector()); |
||||
} |
||||
|
||||
// OpenCL detection
|
||||
std::vector<cv::Point> d_v_locations; |
||||
ocl_hog.detect(d_img, d_v_locations, 0); |
||||
cv::Mat d_locations(d_v_locations); |
||||
|
||||
// CPU detection
|
||||
std::vector<cv::Point> v_locations; |
||||
switch (type) |
||||
{ |
||||
case CV_8UC1: |
||||
hog.detect(img, v_locations, 0); |
||||
break; |
||||
case CV_8UC4: |
||||
default: |
||||
hog.detect(img_rgb, v_locations, 0); |
||||
break; |
||||
} |
||||
cv::Mat locations(v_locations); |
||||
|
||||
char s[100]={0}; |
||||
EXPECT_MAT_NEAR(d_locations, locations, 0, s); |
||||
} |
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine( |
||||
testing::Values(cv::Size(64, 128), cv::Size(48, 96)), |
||||
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)))); |
||||
|
||||
|
||||
#endif //HAVE_OPENCL
|
Loading…
Reference in new issue