mirror of https://github.com/opencv/opencv.git
Merge pull request #1257 from SpecLad:merge-2.4
commit
2349f7ea89
99 changed files with 2217 additions and 373 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,176 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other oclMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable |
||||
|
||||
#ifndef N // number of radices |
||||
#define N 4 |
||||
#endif |
||||
|
||||
#ifndef K_T |
||||
#define K_T float |
||||
#endif |
||||
|
||||
#ifndef V_T |
||||
#define V_T float |
||||
#endif |
||||
|
||||
#ifndef IS_GT |
||||
#define IS_GT 0 |
||||
#endif |
||||
|
||||
|
||||
// from Thrust::b40c, link: |
||||
// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h |
||||
__inline uint convertKey(uint converted_key) |
||||
{ |
||||
#ifdef K_FLT |
||||
unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000; |
||||
converted_key ^= mask; |
||||
#elif defined(K_INT) |
||||
const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); |
||||
converted_key ^= SIGN_MASK; |
||||
#else |
||||
|
||||
#endif |
||||
return converted_key; |
||||
} |
||||
|
||||
//FIXME(pengx17): |
||||
// exclusive scan, need to be optimized as this is too naive... |
||||
kernel |
||||
void naiveScanAddition( |
||||
__global int * input, |
||||
__global int * output, |
||||
int size |
||||
) |
||||
{ |
||||
if(get_global_id(0) == 0) |
||||
{ |
||||
output[0] = 0; |
||||
for(int i = 1; i < size; i ++) |
||||
{ |
||||
output[i] = output[i - 1] + input[i - 1]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
// following is ported from |
||||
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl |
||||
kernel |
||||
void histogramRadixN ( |
||||
__global K_T* unsortedKeys, |
||||
__global int * buckets, |
||||
uint shiftCount |
||||
) |
||||
{ |
||||
const int RADIX_T = N; |
||||
const int RADICES_T = (1 << RADIX_T); |
||||
const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T; |
||||
const int MASK_T = (1 << RADIX_T) - 1; |
||||
int localBuckets[16] = {0,0,0,0,0,0,0,0, |
||||
0,0,0,0,0,0,0,0}; |
||||
int globalId = get_global_id(0); |
||||
int numOfGroups = get_num_groups(0); |
||||
|
||||
/* Calculate thread-histograms */ |
||||
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) |
||||
{ |
||||
uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)])); |
||||
value = (value >> shiftCount) & MASK_T; |
||||
#if IS_GT |
||||
localBuckets[RADICES_T - value - 1]++; |
||||
#else |
||||
localBuckets[value]++; |
||||
#endif |
||||
} |
||||
|
||||
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) |
||||
{ |
||||
buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i]; |
||||
} |
||||
} |
||||
|
||||
kernel |
||||
void permuteRadixN ( |
||||
__global K_T* unsortedKeys, |
||||
__global V_T* unsortedVals, |
||||
__global int* scanedBuckets, |
||||
uint shiftCount, |
||||
__global K_T* sortedKeys, |
||||
__global V_T* sortedVals |
||||
) |
||||
{ |
||||
const int RADIX_T = N; |
||||
const int RADICES_T = (1 << RADIX_T); |
||||
const int MASK_T = (1<<RADIX_T) -1; |
||||
|
||||
int globalId = get_global_id(0); |
||||
int numOfGroups = get_num_groups(0); |
||||
const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N; |
||||
int localIndex[16]; |
||||
|
||||
/*Load the index to local memory*/ |
||||
for(int i = 0; i < RADICES_T; ++i) |
||||
{ |
||||
#if IS_GT |
||||
localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)]; |
||||
#else |
||||
localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)]; |
||||
#endif |
||||
} |
||||
/* Permute elements to appropriate location */ |
||||
for(int i = 0; i < RADICES_T; ++i) |
||||
{ |
||||
int old_idx = mad24(globalId, RADICES_T, i); |
||||
K_T ovalue = unsortedKeys[old_idx]; |
||||
uint value = convertKey(as_uint(ovalue)); |
||||
uint maskedValue = (value >> shiftCount) & MASK_T; |
||||
uint index = localIndex[maskedValue]; |
||||
sortedKeys[index] = ovalue; |
||||
sortedVals[index] = unsortedVals[old_idx]; |
||||
localIndex[maskedValue] = index + 1; |
||||
} |
||||
} |
@ -0,0 +1,245 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other oclMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#ifndef K_T |
||||
#define K_T float |
||||
#endif |
||||
|
||||
#ifndef V_T |
||||
#define V_T float |
||||
#endif |
||||
|
||||
#ifndef IS_GT |
||||
#define IS_GT false |
||||
#endif |
||||
|
||||
#if IS_GT |
||||
#define my_comp(x,y) ((x) > (y)) |
||||
#else |
||||
#define my_comp(x,y) ((x) < (y)) |
||||
#endif |
||||
|
||||
/////////////////////// Bitonic sort //////////////////////////// |
||||
// ported from |
||||
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl |
||||
__kernel |
||||
void bitonicSort |
||||
( |
||||
__global K_T * keys, |
||||
__global V_T * vals, |
||||
int count, |
||||
int stage, |
||||
int passOfStage |
||||
) |
||||
{ |
||||
const int threadId = get_global_id(0); |
||||
if(threadId >= count / 2) |
||||
{ |
||||
return; |
||||
} |
||||
const int pairDistance = 1 << (stage - passOfStage); |
||||
const int blockWidth = 2 * pairDistance; |
||||
|
||||
int leftId = min( (threadId % pairDistance) |
||||
+ (threadId / pairDistance) * blockWidth, count ); |
||||
|
||||
int rightId = min( leftId + pairDistance, count ); |
||||
|
||||
int temp; |
||||
|
||||
const V_T lval = vals[leftId]; |
||||
const V_T rval = vals[rightId]; |
||||
|
||||
const K_T lkey = keys[leftId]; |
||||
const K_T rkey = keys[rightId]; |
||||
|
||||
int sameDirectionBlockWidth = 1 << stage; |
||||
|
||||
if((threadId/sameDirectionBlockWidth) % 2 == 1) |
||||
{ |
||||
temp = rightId; |
||||
rightId = leftId; |
||||
leftId = temp; |
||||
} |
||||
|
||||
const bool compareResult = my_comp(lkey, rkey); |
||||
|
||||
if(compareResult) |
||||
{ |
||||
keys[rightId] = rkey; |
||||
keys[leftId] = lkey; |
||||
vals[rightId] = rval; |
||||
vals[leftId] = lval; |
||||
} |
||||
else |
||||
{ |
||||
keys[rightId] = lkey; |
||||
keys[leftId] = rkey; |
||||
vals[rightId] = lval; |
||||
vals[leftId] = rval; |
||||
} |
||||
} |
||||
|
||||
/////////////////////// Selection sort //////////////////////////// |
||||
//kernel is ported from Bolt library: |
||||
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl |
||||
__kernel |
||||
void selectionSortLocal |
||||
( |
||||
__global K_T * keys, |
||||
__global V_T * vals, |
||||
const int count, |
||||
__local K_T * scratch |
||||
) |
||||
{ |
||||
int i = get_local_id(0); // index in workgroup |
||||
int numOfGroups = get_num_groups(0); // index in workgroup |
||||
int groupID = get_group_id(0); |
||||
int wg = get_local_size(0); // workgroup size = block size |
||||
int n; // number of elements to be processed for this work group |
||||
|
||||
int offset = groupID * wg; |
||||
int same = 0; |
||||
|
||||
vals += offset; |
||||
keys += offset; |
||||
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; |
||||
|
||||
int clamped_i= min(i, n - 1); |
||||
|
||||
K_T key1 = keys[clamped_i], key2; |
||||
V_T val1 = vals[clamped_i]; |
||||
scratch[i] = key1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(i >= n) |
||||
{ |
||||
return; |
||||
} |
||||
|
||||
int pos = 0; |
||||
for (int j=0;j<n;++j) |
||||
{ |
||||
key2 = scratch[j]; |
||||
if(my_comp(key2, key1)) |
||||
pos++;//calculate the rank of this element in this work group |
||||
else |
||||
{ |
||||
if(my_comp(key1, key2)) |
||||
continue; |
||||
else |
||||
{ |
||||
// key1 and key2 are same |
||||
same++; |
||||
} |
||||
} |
||||
} |
||||
for (int j=0; j< same; j++) |
||||
{ |
||||
vals[pos + j] = val1; |
||||
keys[pos + j] = key1; |
||||
} |
||||
} |
||||
__kernel |
||||
void selectionSortFinal |
||||
( |
||||
__global K_T * keys, |
||||
__global V_T * vals, |
||||
const int count |
||||
) |
||||
{ |
||||
const int i = get_local_id(0); // index in workgroup |
||||
const int numOfGroups = get_num_groups(0); // index in workgroup |
||||
const int groupID = get_group_id(0); |
||||
const int wg = get_local_size(0); // workgroup size = block size |
||||
int pos = 0, same = 0; |
||||
const int offset = get_group_id(0) * wg; |
||||
const int remainder = count - wg*(numOfGroups-1); |
||||
|
||||
if((offset + i ) >= count) |
||||
return; |
||||
V_T val1 = vals[offset + i]; |
||||
|
||||
K_T key1 = keys[offset + i]; |
||||
K_T key2; |
||||
|
||||
for(int j=0; j<numOfGroups-1; j++ ) |
||||
{ |
||||
for(int k=0; k<wg; k++) |
||||
{ |
||||
key2 = keys[j*wg + k]; |
||||
if(my_comp(key1, key2)) |
||||
break; |
||||
else |
||||
{ |
||||
//Increment only if the value is not the same. |
||||
if(my_comp(key2, key1)) |
||||
pos++; |
||||
else |
||||
same++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
for(int k=0; k<remainder; k++) |
||||
{ |
||||
key2 = keys[(numOfGroups-1)*wg + k]; |
||||
if(my_comp(key1, key2)) |
||||
break; |
||||
else |
||||
{ |
||||
//Don't increment if the value is the same. |
||||
if(my_comp(key2, key1)) |
||||
pos++; |
||||
else |
||||
same++; |
||||
} |
||||
} |
||||
for (int j=0; j< same; j++) |
||||
{ |
||||
vals[pos + j] = val1; |
||||
keys[pos + j] = key1; |
||||
} |
||||
} |
@ -0,0 +1,296 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other oclMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#ifndef K_T |
||||
#define K_T float |
||||
#endif |
||||
|
||||
#ifndef V_T |
||||
#define V_T float |
||||
#endif |
||||
|
||||
#ifndef IS_GT |
||||
#define IS_GT false |
||||
#endif |
||||
|
||||
#if IS_GT |
||||
#define my_comp(x,y) ((x) > (y)) |
||||
#else |
||||
#define my_comp(x,y) ((x) < (y)) |
||||
#endif |
||||
|
||||
///////////// parallel merge sort /////////////// |
||||
// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl |
||||
uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) |
||||
{ |
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence |
||||
uint firstIndex = left; |
||||
uint lastIndex = right; |
||||
|
||||
// This loops through [firstIndex, lastIndex) |
||||
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch, |
||||
// this while loop will be divergent within a wavefront |
||||
while( firstIndex < lastIndex ) |
||||
{ |
||||
K_T dataVal = data[ firstIndex ]; |
||||
|
||||
// This branch will create divergent wavefronts |
||||
if( my_comp( dataVal, searchVal ) ) |
||||
{ |
||||
firstIndex = firstIndex+1; |
||||
} |
||||
else |
||||
{ |
||||
break; |
||||
} |
||||
} |
||||
|
||||
return firstIndex; |
||||
} |
||||
|
||||
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted |
||||
// by a base pointer and left and right index for a particular candidate value. The comparison operator is |
||||
// passed as a functor parameter my_comp |
||||
// This function returns an index that is the first index whos value would be equal to the searched value |
||||
uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) |
||||
{ |
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence |
||||
uint firstIndex = left; |
||||
uint lastIndex = right; |
||||
|
||||
// This loops through [firstIndex, lastIndex) |
||||
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch, |
||||
// this while loop will be divergent within a wavefront |
||||
while( firstIndex < lastIndex ) |
||||
{ |
||||
// midIndex is the average of first and last, rounded down |
||||
uint midIndex = ( firstIndex + lastIndex ) / 2; |
||||
K_T midValue = data[ midIndex ]; |
||||
|
||||
// This branch will create divergent wavefronts |
||||
if( my_comp( midValue, searchVal ) ) |
||||
{ |
||||
firstIndex = midIndex+1; |
||||
// printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex ); |
||||
} |
||||
else |
||||
{ |
||||
lastIndex = midIndex; |
||||
// printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex ); |
||||
} |
||||
} |
||||
|
||||
return firstIndex; |
||||
} |
||||
|
||||
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted |
||||
// by a base pointer and left and right index for a particular candidate value. The comparison operator is |
||||
// passed as a functor parameter my_comp |
||||
// This function returns an index that is the first index whos value would be greater than the searched value |
||||
// If the search value is not found in the sequence, upperbound returns the same result as lowerbound |
||||
uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) |
||||
{ |
||||
uint upperBound = lowerBoundBinary( data, left, right, searchVal ); |
||||
|
||||
// printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound ); |
||||
// If upperBound == right, then searchVal was not found in the sequence. Just return. |
||||
if( upperBound != right ) |
||||
{ |
||||
// While the values are equal i.e. !(x < y) && !(y < x) increment the index |
||||
K_T upperValue = data[ upperBound ]; |
||||
while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) ) |
||||
{ |
||||
upperBound++; |
||||
upperValue = data[ upperBound ]; |
||||
} |
||||
} |
||||
|
||||
return upperBound; |
||||
} |
||||
|
||||
// This kernel implements merging of blocks of sorted data. The input to this kernel most likely is |
||||
// the output of blockInsertionSortTemplate. It is expected that the source array contains multiple |
||||
// blocks, each block is independently sorted. The goal is to write into the output buffer half as |
||||
// many blocks, of double the size. The even and odd blocks are stably merged together to form |
||||
// a new sorted block of twice the size. The algorithm is out-of-place. |
||||
kernel void merge( |
||||
global K_T* iKey_ptr, |
||||
global V_T* iValue_ptr, |
||||
global K_T* oKey_ptr, |
||||
global V_T* oValue_ptr, |
||||
const uint srcVecSize, |
||||
const uint srcLogicalBlockSize, |
||||
local K_T* key_lds, |
||||
local V_T* val_lds |
||||
) |
||||
{ |
||||
size_t globalID = get_global_id( 0 ); |
||||
size_t groupID = get_group_id( 0 ); |
||||
size_t localID = get_local_id( 0 ); |
||||
size_t wgSize = get_local_size( 0 ); |
||||
|
||||
// Abort threads that are passed the end of the input vector |
||||
if( globalID >= srcVecSize ) |
||||
return; // on SI this doesn't mess-up barriers |
||||
|
||||
// For an element in sequence A, find the lowerbound index for it in sequence B |
||||
uint srcBlockNum = globalID / srcLogicalBlockSize; |
||||
uint srcBlockIndex = globalID % srcLogicalBlockSize; |
||||
|
||||
// printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex ); |
||||
|
||||
// Pairs of even-odd blocks will be merged together |
||||
// An even block should search for an insertion point in the next odd block, |
||||
// and the odd block should look for an insertion point in the corresponding previous even block |
||||
uint dstLogicalBlockSize = srcLogicalBlockSize<<1; |
||||
uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 ); |
||||
leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize; |
||||
leftBlockIndex = min( leftBlockIndex, srcVecSize ); |
||||
uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize ); |
||||
|
||||
// if( localID == 0 ) |
||||
// { |
||||
// printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex ); |
||||
// } |
||||
|
||||
// For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex |
||||
// uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex; |
||||
uint insertionIndex = 0; |
||||
if( (srcBlockNum & 0x1) == 0 ) |
||||
{ |
||||
insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; |
||||
} |
||||
else |
||||
{ |
||||
insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; |
||||
} |
||||
|
||||
// The index of an element in the result sequence is the summation of it's indixes in the two input |
||||
// sequences |
||||
uint dstBlockIndex = srcBlockIndex + insertionIndex; |
||||
uint dstBlockNum = srcBlockNum/2; |
||||
|
||||
// if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 ) |
||||
// { |
||||
// printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex ); |
||||
// printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] ); |
||||
// } |
||||
oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ]; |
||||
oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ]; |
||||
// printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex ); |
||||
} |
||||
|
||||
kernel void blockInsertionSort( |
||||
global K_T* key_ptr, |
||||
global V_T* value_ptr, |
||||
const uint vecSize, |
||||
local K_T* key_lds, |
||||
local V_T* val_lds |
||||
) |
||||
{ |
||||
size_t gloId = get_global_id( 0 ); |
||||
size_t groId = get_group_id( 0 ); |
||||
size_t locId = get_local_id( 0 ); |
||||
size_t wgSize = get_local_size( 0 ); |
||||
|
||||
bool in_range = gloId < vecSize; |
||||
K_T key; |
||||
V_T val; |
||||
// Abort threads that are passed the end of the input vector |
||||
if (in_range) |
||||
{ |
||||
// Make a copy of the entire input array into fast local memory |
||||
key = key_ptr[ gloId ]; |
||||
val = value_ptr[ gloId ]; |
||||
key_lds[ locId ] = key; |
||||
val_lds[ locId ] = val; |
||||
} |
||||
barrier( CLK_LOCAL_MEM_FENCE ); |
||||
// Sorts a workgroup using a naive insertion sort |
||||
// The sort uses one thread within a workgroup to sort the entire workgroup |
||||
if( locId == 0 && in_range ) |
||||
{ |
||||
// The last workgroup may have an irregular size, so we calculate a per-block endIndex |
||||
// endIndex is essentially emulating a mod operator with subtraction and multiply |
||||
size_t endIndex = vecSize - ( groId * wgSize ); |
||||
endIndex = min( endIndex, wgSize ); |
||||
|
||||
// printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); |
||||
|
||||
// Indices are signed because the while loop will generate a -1 index inside of the max function |
||||
for( int currIndex = 1; currIndex < endIndex; ++currIndex ) |
||||
{ |
||||
key = key_lds[ currIndex ]; |
||||
val = val_lds[ currIndex ]; |
||||
int scanIndex = currIndex; |
||||
K_T ldsKey = key_lds[scanIndex - 1]; |
||||
while( scanIndex > 0 && my_comp( key, ldsKey ) ) |
||||
{ |
||||
V_T ldsVal = val_lds[scanIndex - 1]; |
||||
|
||||
// If the keys are being swapped, make sure the values are swapped identicaly |
||||
key_lds[ scanIndex ] = ldsKey; |
||||
val_lds[ scanIndex ] = ldsVal; |
||||
|
||||
scanIndex = scanIndex - 1; |
||||
ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1 |
||||
} |
||||
key_lds[ scanIndex ] = key; |
||||
val_lds[ scanIndex ] = val; |
||||
} |
||||
} |
||||
barrier( CLK_LOCAL_MEM_FENCE ); |
||||
|
||||
if(in_range) |
||||
{ |
||||
key = key_lds[ locId ]; |
||||
key_ptr[ gloId ] = key; |
||||
|
||||
val = val_lds[ locId ]; |
||||
value_ptr[ gloId ] = val; |
||||
} |
||||
} |
||||
|
||||
///////////// Radix sort from b40c library ///////////// |
@ -0,0 +1,454 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include <iomanip> |
||||
#include "precomp.hpp" |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
|
||||
extern const char * kernel_sort_by_key; |
||||
extern const char * kernel_stablesort_by_key; |
||||
extern const char * kernel_radix_sort_by_key; |
||||
|
||||
void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan); |
||||
|
||||
//TODO(pengx17): change this value depending on device other than a constant
|
||||
const static unsigned int GROUP_SIZE = 256; |
||||
|
||||
const char * depth_strings[] = |
||||
{ |
||||
"uchar", //CV_8U
|
||||
"char", //CV_8S
|
||||
"ushort", //CV_16U
|
||||
"short", //CV_16S
|
||||
"int", //CV_32S
|
||||
"float", //CV_32F
|
||||
"double" //CV_64F
|
||||
}; |
||||
|
||||
void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf) |
||||
{ |
||||
sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s", |
||||
isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]); |
||||
if(vals.oclchannels() > 1) |
||||
{ |
||||
sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels()); |
||||
} |
||||
} |
||||
inline bool isSizePowerOf2(size_t size) |
||||
{ |
||||
return ((size - 1) & (size)) == 0; |
||||
} |
||||
|
||||
namespace bitonic_sort |
||||
{ |
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||
{ |
||||
CV_Assert(isSizePowerOf2(vecSize)); |
||||
|
||||
Context * cxt = Context::getContext(); |
||||
size_t globalThreads[3] = {vecSize / 2, 1, 1}; |
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||
|
||||
// 2^numStages should be equal to vecSize or the output is invalid
|
||||
int numStages = 0; |
||||
for(int i = vecSize; i > 1; i >>= 1) |
||||
{ |
||||
++numStages; |
||||
} |
||||
char build_opt_buf [100]; |
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||
const int argc = 5; |
||||
std::vector< std::pair<size_t, const void *> > args(argc); |
||||
String kernelname = "bitonicSort"; |
||||
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||
args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize); |
||||
|
||||
for(int stage = 0; stage < numStages; ++stage) |
||||
{ |
||||
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); |
||||
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) |
||||
{ |
||||
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); |
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||
} |
||||
} |
||||
} |
||||
} /* bitonic_sort */ |
||||
|
||||
namespace selection_sort |
||||
{ |
||||
// FIXME:
|
||||
// This function cannot sort arrays with duplicated keys
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||
{ |
||||
CV_Error(-1, "This function is incorrect at the moment."); |
||||
Context * cxt = Context::getContext(); |
||||
|
||||
size_t globalThreads[3] = {vecSize, 1, 1}; |
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||
|
||||
std::vector< std::pair<size_t, const void *> > args; |
||||
char build_opt_buf [100]; |
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||
|
||||
//local
|
||||
String kernelname = "selectionSortLocal"; |
||||
int lds_size = GROUP_SIZE * keys.elemSize(); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize)); |
||||
args.push_back(std::make_pair(lds_size, (void*)NULL)); |
||||
|
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||
|
||||
//final
|
||||
kernelname = "selectionSortFinal"; |
||||
args.pop_back(); |
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||
} |
||||
|
||||
} /* selection_sort */ |
||||
|
||||
|
||||
namespace radix_sort |
||||
{ |
||||
//FIXME(pengx17):
|
||||
// exclusive scan, need to be optimized as this is too naive...
|
||||
//void naive_scan_addition(oclMat& input, oclMat& output)
|
||||
//{
|
||||
// Context * cxt = Context::getContext();
|
||||
// size_t vecSize = input.cols;
|
||||
// size_t globalThreads[3] = {1, 1, 1};
|
||||
// size_t localThreads[3] = {1, 1, 1};
|
||||
//
|
||||
// String kernelname = "naiveScanAddition";
|
||||
//
|
||||
// std::vector< std::pair<size_t, const void *> > args;
|
||||
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
|
||||
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
|
||||
// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
|
||||
// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||
//}
|
||||
|
||||
void static naive_scan_addition_cpu(oclMat& input, oclMat& output) |
||||
{ |
||||
Mat m_input = input, m_output(output.size(), output.type()); |
||||
MatIterator_<int> i_mit = m_input.begin<int>(); |
||||
MatIterator_<int> o_mit = m_output.begin<int>(); |
||||
*o_mit = 0; |
||||
++i_mit; |
||||
++o_mit; |
||||
for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit) |
||||
{ |
||||
*o_mit = *(o_mit - 1) + *(i_mit - 1); |
||||
} |
||||
output = m_output; |
||||
} |
||||
|
||||
|
||||
//radix sort ported from Bolt
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan) |
||||
{ |
||||
CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
|
||||
|
||||
bool isKeyFloat = keys.type() == CV_32F; |
||||
|
||||
const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
|
||||
//local array of 16 elements in the histogram kernel.
|
||||
const int RADICES = (1 << RADIX); //Values handeled by each work-item?
|
||||
|
||||
bool newBuffer = false; |
||||
size_t vecSize = origVecSize; |
||||
|
||||
unsigned int groupSize = RADICES; |
||||
|
||||
size_t mulFactor = groupSize * RADICES; |
||||
|
||||
oclMat buffer_keys, buffer_vals; |
||||
|
||||
if(origVecSize % mulFactor != 0) |
||||
{ |
||||
vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor; |
||||
buffer_keys.create(1, vecSize, keys.type()); |
||||
buffer_vals.create(1, vecSize, vals.type()); |
||||
Scalar padding_value; |
||||
oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1)); |
||||
|
||||
if(isGreaterThan) |
||||
{ |
||||
switch(buffer_keys.depth()) |
||||
{ |
||||
case CV_32F: |
||||
padding_value = Scalar::all(-FLT_MAX); |
||||
break; |
||||
case CV_32S: |
||||
padding_value = Scalar::all(INT_MIN); |
||||
break; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
switch(buffer_keys.depth()) |
||||
{ |
||||
case CV_32F: |
||||
padding_value = Scalar::all(FLT_MAX); |
||||
break; |
||||
case CV_32S: |
||||
padding_value = Scalar::all(INT_MAX); |
||||
break; |
||||
} |
||||
} |
||||
ocl::copyMakeBorder( |
||||
keys(Rect(0,0,origVecSize,1)), buffer_keys,
|
||||
0, 0, 0, vecSize - origVecSize,
|
||||
BORDER_CONSTANT, padding_value); |
||||
vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals); |
||||
newBuffer = true; |
||||
} |
||||
else |
||||
{ |
||||
buffer_keys = keys; |
||||
buffer_vals = vals; |
||||
newBuffer = false; |
||||
} |
||||
oclMat swap_input_keys(1, vecSize, keys.type()); |
||||
oclMat swap_input_vals(1, vecSize, vals.type()); |
||||
oclMat hist_bin_keys(1, vecSize, CV_32SC1); |
||||
oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1); |
||||
|
||||
Context * cxt = Context::getContext(); |
||||
|
||||
size_t globalThreads[3] = {vecSize / RADICES, 1, 1}; |
||||
size_t localThreads[3] = {groupSize, 1, 1}; |
||||
|
||||
std::vector< std::pair<size_t, const void *> > args; |
||||
char build_opt_buf [100]; |
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||
|
||||
//additional build option for radix sort
|
||||
sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
|
||||
|
||||
String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")}; |
||||
|
||||
int swap = 0; |
||||
for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX) |
||||
{ |
||||
args.clear(); |
||||
//Do a histogram pass locally
|
||||
if(swap == 0) |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||
} |
||||
else |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||
} |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); |
||||
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads, |
||||
args, -1, -1, build_opt_buf); |
||||
|
||||
args.clear(); |
||||
//Perform a global scan
|
||||
naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys); |
||||
// end of scan
|
||||
if(swap == 0) |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); |
||||
} |
||||
else |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); |
||||
} |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); |
||||
|
||||
if(swap == 0) |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); |
||||
} |
||||
else |
||||
{ |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); |
||||
} |
||||
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads, |
||||
args, -1, -1, build_opt_buf); |
||||
swap = swap ? 0 : 1; |
||||
} |
||||
if(newBuffer) |
||||
{ |
||||
buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys); |
||||
buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals); |
||||
} |
||||
} |
||||
|
||||
} /* radix_sort */ |
||||
|
||||
namespace merge_sort |
||||
{ |
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||
{ |
||||
Context * cxt = Context::getContext(); |
||||
|
||||
size_t globalThreads[3] = {vecSize, 1, 1}; |
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||
|
||||
std::vector< std::pair<size_t, const void *> > args; |
||||
char build_opt_buf [100]; |
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||
|
||||
String kernelname[] = {String("blockInsertionSort"), String("merge")}; |
||||
int keylds_size = GROUP_SIZE * keys.elemSize(); |
||||
int vallds_size = GROUP_SIZE * vals.elemSize(); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); |
||||
args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize)); |
||||
args.push_back(std::make_pair(keylds_size, (void*)NULL)); |
||||
args.push_back(std::make_pair(vallds_size, (void*)NULL)); |
||||
|
||||
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||
|
||||
// Early exit for the case of no merge passes, values are already in destination vector
|
||||
if(vecSize <= GROUP_SIZE) |
||||
{ |
||||
return; |
||||
} |
||||
|
||||
// An odd number of elements requires an extra merge pass to sort
|
||||
size_t numMerges = 0; |
||||
// Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
|
||||
// this is how many merge passes we want
|
||||
size_t log2BlockSize = vecSize >> 6; |
||||
for( ; log2BlockSize > 1; log2BlockSize >>= 1 ) |
||||
{ |
||||
++numMerges; |
||||
} |
||||
// Check to see if the input vector size is a power of 2, if not we will need last merge pass
|
||||
numMerges += isSizePowerOf2(vecSize)? 1: 0; |
||||
|
||||
// Allocate a flipflop buffer because the merge passes are out of place
|
||||
oclMat tmpKeyBuffer(keys.size(), keys.type()); |
||||
oclMat tmpValBuffer(vals.size(), vals.type()); |
||||
args.resize(8); |
||||
|
||||
args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize); |
||||
args[6] = std::make_pair(keylds_size, (void*)NULL); |
||||
args[7] = std::make_pair(vallds_size, (void*)NULL); |
||||
|
||||
for(size_t pass = 1; pass <= numMerges; ++pass ) |
||||
{ |
||||
// For each pass, flip the input-output buffers
|
||||
if( pass & 0x1 ) |
||||
{ |
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||
args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); |
||||
args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); |
||||
} |
||||
else |
||||
{ |
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); |
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); |
||||
args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||
args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||
} |
||||
// For each pass, the merge window doubles
|
||||
unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) ); |
||||
args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize); |
||||
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||
} |
||||
// If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
|
||||
// the results back into the input array
|
||||
if( numMerges & 1 ) |
||||
{ |
||||
tmpKeyBuffer.copyTo(keys); |
||||
tmpValBuffer.copyTo(vals); |
||||
} |
||||
} |
||||
} /* merge_sort */ |
||||
|
||||
} |
||||
} /* namespace cv { namespace ocl */ |
||||
|
||||
|
||||
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan) |
||||
{ |
||||
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||
CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
|
||||
CV_Assert( vecSize <= static_cast<size_t>(keys.cols) ); |
||||
switch(method) |
||||
{ |
||||
case SORT_BITONIC: |
||||
bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan); |
||||
break; |
||||
case SORT_SELECTION: |
||||
selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan); |
||||
break; |
||||
case SORT_RADIX: |
||||
radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan); |
||||
break; |
||||
case SORT_MERGE: |
||||
merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan); |
||||
break; |
||||
} |
||||
} |
||||
|
||||
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan) |
||||
{ |
||||
CV_Assert( keys.size() == vals.size() ); |
||||
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||
size_t vecSize = static_cast<size_t>(keys.cols); |
||||
sortByKey(keys, vals, vecSize, method, isGreaterThan); |
||||
} |
@ -0,0 +1,244 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#include <map> |
||||
#include <functional> |
||||
#include "precomp.hpp" |
||||
|
||||
using namespace std; |
||||
using namespace cvtest; |
||||
using namespace testing; |
||||
using namespace cv; |
||||
|
||||
|
||||
namespace |
||||
{ |
||||
IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool) |
||||
IMPLEMENT_PARAM_CLASS(InputSize, int) |
||||
IMPLEMENT_PARAM_CLASS(SortMethod, int) |
||||
|
||||
|
||||
template<class T>
|
||||
struct KV_CVTYPE{ static int toType() {return 0;} }; |
||||
|
||||
template<> struct KV_CVTYPE<int> { static int toType() {return CV_32SC1;} }; |
||||
template<> struct KV_CVTYPE<float>{ static int toType() {return CV_32FC1;} }; |
||||
template<> struct KV_CVTYPE<Vec2i>{ static int toType() {return CV_32SC2;} }; |
||||
template<> struct KV_CVTYPE<Vec2f>{ static int toType() {return CV_32FC2;} }; |
||||
|
||||
template<class key_type, class val_type> |
||||
bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2) |
||||
{ |
||||
return p1.first > p2.first; |
||||
} |
||||
|
||||
template<class key_type, class val_type> |
||||
bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2) |
||||
{ |
||||
return p1.first < p2.first; |
||||
} |
||||
|
||||
template<class key_type, class val_type> |
||||
void toKVPair( |
||||
MatConstIterator_<key_type> kit, |
||||
MatConstIterator_<val_type> vit, |
||||
int vecSize, |
||||
vector<pair<key_type, val_type> >& kvres |
||||
) |
||||
{ |
||||
kvres.clear(); |
||||
for(int i = 0; i < vecSize; i ++) |
||||
{ |
||||
kvres.push_back(make_pair(*kit, *vit)); |
||||
++kit; |
||||
++vit; |
||||
} |
||||
} |
||||
|
||||
template<class key_type, class val_type> |
||||
void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false) |
||||
{ |
||||
vector<pair<key_type, val_type> > kvres; |
||||
toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), keys.cols, kvres); |
||||
|
||||
if(isGreater) |
||||
{ |
||||
std::sort(kvres.begin(), kvres.end(), kvgreater<key_type, val_type>); |
||||
} |
||||
else |
||||
{ |
||||
std::sort(kvres.begin(), kvres.end(), kvless<key_type, val_type>); |
||||
} |
||||
key_type * kptr = keys.ptr<key_type>(); |
||||
val_type * vptr = vals.ptr<val_type>(); |
||||
for(int i = 0; i < keys.cols; i ++) |
||||
{ |
||||
kptr[i] = kvres[i].first; |
||||
vptr[i] = kvres[i].second; |
||||
} |
||||
} |
||||
|
||||
class SortByKey_STL |
||||
{ |
||||
public: |
||||
static void sort(cv::Mat&, cv::Mat&, bool is_gt); |
||||
private: |
||||
typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool); |
||||
SortByKey_STL(); |
||||
quick_sorter quick_sorters[CV_64FC4][CV_64FC4]; |
||||
static SortByKey_STL instance; |
||||
}; |
||||
|
||||
SortByKey_STL SortByKey_STL::instance = SortByKey_STL(); |
||||
|
||||
SortByKey_STL::SortByKey_STL() |
||||
{ |
||||
memset(instance.quick_sorters, 0, sizeof(quick_sorters)); |
||||
#define NEW_SORTER(KT, VT) \ |
||||
instance.quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>; |
||||
|
||||
NEW_SORTER(int, int); |
||||
NEW_SORTER(int, Vec2i); |
||||
NEW_SORTER(int, float); |
||||
NEW_SORTER(int, Vec2f); |
||||
|
||||
NEW_SORTER(float, int); |
||||
NEW_SORTER(float, Vec2i); |
||||
NEW_SORTER(float, float); |
||||
NEW_SORTER(float, Vec2f); |
||||
#undef NEW_SORTER |
||||
} |
||||
|
||||
void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt) |
||||
{ |
||||
instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt); |
||||
} |
||||
|
||||
bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_, |
||||
const Mat& /*dkeys_*/, const Mat& dvals_) |
||||
{ |
||||
int cn_val = gvals_.channels(); |
||||
int count = gkeys_.cols; |
||||
|
||||
//for convenience we convert depth to float and channels to 1
|
||||
Mat gkeys, gvals, dkeys, dvals; |
||||
gkeys_.reshape(1).convertTo(gkeys, CV_32F); |
||||
gvals_.reshape(1).convertTo(gvals, CV_32F); |
||||
//dkeys_.reshape(1).convertTo(dkeys, CV_32F);
|
||||
dvals_.reshape(1).convertTo(dvals, CV_32F); |
||||
float * gkptr = gkeys.ptr<float>(); |
||||
float * gvptr = gvals.ptr<float>(); |
||||
//float * dkptr = dkeys.ptr<float>();
|
||||
float * dvptr = dvals.ptr<float>(); |
||||
|
||||
for(int i = 0; i < count - 1; ++i) |
||||
{ |
||||
int iden_count = 0; |
||||
// firstly calculate the number of identical keys
|
||||
while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count]) |
||||
{ |
||||
++ iden_count; |
||||
} |
||||
|
||||
// sort dv and gv
|
||||
int num_of_val = (iden_count + 1) * cn_val; |
||||
std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val); |
||||
std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val); |
||||
|
||||
// then check if [i, i + iden_count) is the same
|
||||
for(int j = 0; j < num_of_val; ++j) |
||||
{ |
||||
if(gvptr[i + j] != dvptr[i + j]) |
||||
{ |
||||
return false; |
||||
} |
||||
} |
||||
i += iden_count; |
||||
} |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
#define INPUT_SIZES Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
|
||||
#define KEY_TYPES Values(MatType(CV_32SC1), MatType(CV_32FC1)) |
||||
#define VAL_TYPES Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2)) |
||||
#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/) |
||||
#define F_OR_T Values(IsGreaterThan(false), IsGreaterThan(true)) |
||||
|
||||
PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan) |
||||
{ |
||||
InputSize input_size; |
||||
MatType key_type, val_type; |
||||
SortMethod method; |
||||
IsGreaterThan is_gt; |
||||
|
||||
Mat mat_key, mat_val; |
||||
virtual void SetUp() |
||||
{ |
||||
input_size = GET_PARAM(0); |
||||
key_type = GET_PARAM(1); |
||||
val_type = GET_PARAM(2); |
||||
method = GET_PARAM(3); |
||||
is_gt = GET_PARAM(4); |
||||
|
||||
using namespace cv; |
||||
// fill key and val
|
||||
mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX); |
||||
mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX); |
||||
} |
||||
}; |
||||
|
||||
TEST_P(SortByKey, Accuracy) |
||||
{ |
||||
using namespace cv; |
||||
ocl::oclMat oclmat_key(mat_key); |
||||
ocl::oclMat oclmat_val(mat_val); |
||||
|
||||
ocl::sortByKey(oclmat_key, oclmat_val, method, is_gt); |
||||
SortByKey_STL::sort(mat_key, mat_val, is_gt); |
||||
|
||||
EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0); |
||||
EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val)); |
||||
} |
||||
INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T)); |
Loading…
Reference in new issue