Optimize bfmatcher by passing macros.

pull/807/head
peng xiao 12 years ago
parent 1e49c00f4b
commit 113b7584e0
  1. 39
      modules/ocl/src/brute_force_matcher.cpp
  2. 91
      modules/ocl/src/opencl/brute_force_match.cl

@ -74,6 +74,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
int m_size = MAX_DESC_LEN;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -82,8 +85,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -93,7 +94,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
std::string kernelName = "BruteForceMatch_UnrollMatch";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}
@ -115,6 +116,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
int block_size = BLOCK_SIZE;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d", block_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -123,7 +127,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -133,7 +136,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
std::string kernelName = "BruteForceMatch_Match";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}
@ -157,6 +160,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
int m_size = MAX_DESC_LEN;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -167,8 +173,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -180,7 +184,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
std::string kernelName = "BruteForceMatch_RadiusUnrollMatch";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}
@ -197,6 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
int block_size = BLOCK_SIZE;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d", block_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -207,7 +214,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -219,7 +225,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
std::string kernelName = "BruteForceMatch_RadiusMatch";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}
@ -294,6 +300,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
int m_size = MAX_DESC_LEN;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -302,8 +311,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -313,7 +320,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
std::string kernelName = "BruteForceMatch_knnUnrollMatch";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}
@ -328,6 +335,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
int block_size = BLOCK_SIZE;
vector< pair<size_t, const void *> > args;
static const int OPT_SIZE = 40;
char opt [OPT_SIZE] = "";
sprintf(opt, "-D block_size=%d", block_size);
if(globalSize[0] != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@ -336,7 +346,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@ -346,7 +355,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
std::string kernelName = "BruteForceMatch_knnMatch";
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
}
}

@ -1,5 +1,58 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#define MAX_FLOAT 1e7f
#define MAX_FLOAT 3.40282e+038f
#ifndef block_size
#define block_size 16
#endif
#ifndef max_desc_len
#define max_desc_len 64
#endif
int bit1Count(float x)
{
@ -15,7 +68,6 @@ int bit1Count(float x)
float reduce_block(__local float *s_query,
__local float *s_train,
int block_size,
int lidx,
int lidy,
int distType
@ -51,8 +103,6 @@ float reduce_block(__local float *s_query,
float reduce_multi_block(__local float *s_query,
__local float *s_train,
int max_desc_len,
int block_size,
int block_index,
int lidx,
int lidy,
@ -98,8 +148,6 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
@ -108,6 +156,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
@ -117,6 +166,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
int queryIdx = groupidx * block_size + lidy;
// load the query into local memory.
#pragma unroll
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
int loadx = lidx + i * block_size;
@ -128,9 +178,10 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
// loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++)
{
float result = 0;
#pragma unroll
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
@ -140,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -168,6 +219,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- now all reduce implement in each threads.
#pragma unroll
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
@ -191,7 +243,6 @@ __kernel void BruteForceMatch_Match_D5(
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
@ -232,7 +283,7 @@ __kernel void BruteForceMatch_Match_D5(
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
result += reduce_block(s_query, s_train, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -287,8 +338,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
@ -322,7 +371,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
result += reduce_block(s_query, s_train, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -350,7 +399,6 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
@ -384,7 +432,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
result += reduce_block(s_query, s_train, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -410,8 +458,6 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
@ -455,7 +501,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -559,7 +605,6 @@ __kernel void BruteForceMatch_knnMatch_D5(
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
@ -600,7 +645,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
result += reduce_block(s_query, s_train, lidx, lidy, distType);
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -703,8 +748,6 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
@ -721,7 +764,6 @@ kernel void BruteForceMatch_calcDistance_D5(
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
@ -736,8 +778,7 @@ kernel void BruteForceMatch_findBestMatch_D5(
__global float *allDist,
__global int *bestTrainIdx,
__global float *bestDistance,
int k,
int block_size
int k
)
{
/* Todo */

Loading…
Cancel
Save