diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 5656e926ba..ee0989d643 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -74,6 +74,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -82,8 +85,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -93,7 +94,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat std::string kernelName = "BruteForceMatch_UnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -115,6 +116,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -123,7 +127,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -133,7 +136,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_Match"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -157,6 +160,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -167,8 +173,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -180,7 +184,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -197,6 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -207,7 +214,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -219,7 +225,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c std::string kernelName = "BruteForceMatch_RadiusMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -294,6 +300,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -302,8 +311,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -313,7 +320,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl std::string kernelName = "BruteForceMatch_knnUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -328,6 +335,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -336,7 +346,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -346,7 +355,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_knnMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index e76fb1d21e..7821920dc2 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -1,5 +1,58 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#define MAX_FLOAT 1e7f +#define MAX_FLOAT 3.40282e+038f + +#ifndef block_size +#define block_size 16 +#endif +#ifndef max_desc_len +#define max_desc_len 64 +#endif int bit1Count(float x) { @@ -15,7 +68,6 @@ int bit1Count(float x) float reduce_block(__local float *s_query, __local float *s_train, - int block_size, int lidx, int lidy, int distType @@ -51,8 +103,6 @@ float reduce_block(__local float *s_query, float reduce_multi_block(__local float *s_query, __local float *s_train, - int max_desc_len, - int block_size, int block_index, int lidx, int lidy, @@ -98,8 +148,6 @@ __kernel void BruteForceMatch_UnrollMatch_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -108,6 +156,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int distType ) { + const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); @@ -117,6 +166,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int queryIdx = groupidx * block_size + lidy; // load the query into local memory. + #pragma unroll for (int i = 0 ; i < max_desc_len / block_size; i ++) { int loadx = lidx + i * block_size; @@ -128,9 +178,10 @@ __kernel void BruteForceMatch_UnrollMatch_D5( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++) { float result = 0; + #pragma unroll for (int i = 0 ; i < max_desc_len / block_size ; i++) { //load a block_size * block_size block into local train. @@ -140,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -168,6 +219,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); //reduce -- now all reduce implement in each threads. + #pragma unroll for (int k = 0 ; k < block_size; k++) { if (myBestDistance > s_distance[k]) @@ -191,7 +243,6 @@ __kernel void BruteForceMatch_Match_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -232,7 +283,7 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -287,8 +338,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -322,7 +371,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -350,7 +399,6 @@ __kernel void BruteForceMatch_RadiusMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -384,7 +432,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -410,8 +458,6 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -455,7 +501,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -559,7 +605,6 @@ __kernel void BruteForceMatch_knnMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -600,7 +645,7 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -703,8 +748,6 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -721,7 +764,6 @@ kernel void BruteForceMatch_calcDistance_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -736,8 +778,7 @@ kernel void BruteForceMatch_findBestMatch_D5( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, - int k, - int block_size + int k ) { /* Todo */