mirror of https://github.com/opencv/opencv.git
Most codes are ported from AMD's Bolt library. Four methods are implemented: SORT_BITONIC, // only support power-of-2 buffer size SORT_SELECTION, // cannot sort duplicate keys SORT_MERGE, SORT_RADIX // only support signed int/float keyspull/1189/head
parent
c48d3ad71e
commit
e1f43e3b03
6 changed files with 1441 additions and 0 deletions
@ -0,0 +1,176 @@ |
|||||||
|
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||||
|
// |
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||||
|
// |
||||||
|
// By downloading, copying, installing or using the software you agree to this license. |
||||||
|
// If you do not agree to this license, do not download, install, |
||||||
|
// copy or use the software. |
||||||
|
// |
||||||
|
// |
||||||
|
// License Agreement |
||||||
|
// For Open Source Computer Vision Library |
||||||
|
// |
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||||
|
// Third party copyrights are property of their respective owners. |
||||||
|
// |
||||||
|
// @Authors |
||||||
|
// Peng Xiao, pengxiao@outlook.com |
||||||
|
// |
||||||
|
// Redistribution and use in source and binary forms, with or without modification, |
||||||
|
// are permitted provided that the following conditions are met: |
||||||
|
// |
||||||
|
// * Redistribution's of source code must retain the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer. |
||||||
|
// |
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer in the documentation |
||||||
|
// and/or other oclMaterials provided with the distribution. |
||||||
|
// |
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products |
||||||
|
// derived from this software without specific prior written permission. |
||||||
|
// |
||||||
|
// This software is provided by the copyright holders and contributors as is and |
||||||
|
// any express or implied warranties, including, but not limited to, the implied |
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||||
|
// indirect, incidental, special, exemplary, or consequential damages |
||||||
|
// (including, but not limited to, procurement of substitute goods or services; |
||||||
|
// loss of use, data, or profits; or business interruption) however caused |
||||||
|
// and on any theory of liability, whether in contract, strict liability, |
||||||
|
// or tort (including negligence or otherwise) arising in any way out of |
||||||
|
// the use of this software, even if advised of the possibility of such damage. |
||||||
|
// |
||||||
|
//M*/ |
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable |
||||||
|
|
||||||
|
#ifndef N // number of radices |
||||||
|
#define N 4 |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef K_T |
||||||
|
#define K_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef V_T |
||||||
|
#define V_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef IS_GT |
||||||
|
#define IS_GT 0 |
||||||
|
#endif |
||||||
|
|
||||||
|
|
||||||
|
// from Thrust::b40c, link: |
||||||
|
// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h |
||||||
|
__inline uint convertKey(uint converted_key) |
||||||
|
{ |
||||||
|
#ifdef K_FLT |
||||||
|
unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000; |
||||||
|
converted_key ^= mask; |
||||||
|
#elif defined(K_INT) |
||||||
|
const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); |
||||||
|
converted_key ^= SIGN_MASK; |
||||||
|
#else |
||||||
|
|
||||||
|
#endif |
||||||
|
return converted_key; |
||||||
|
} |
||||||
|
|
||||||
|
//FIXME(pengx17): |
||||||
|
// exclusive scan, need to be optimized as this is too naive... |
||||||
|
kernel |
||||||
|
void naiveScanAddition( |
||||||
|
__global int * input, |
||||||
|
__global int * output, |
||||||
|
int size |
||||||
|
) |
||||||
|
{ |
||||||
|
if(get_global_id(0) == 0) |
||||||
|
{ |
||||||
|
output[0] = 0; |
||||||
|
for(int i = 1; i < size; i ++) |
||||||
|
{ |
||||||
|
output[i] = output[i - 1] + input[i - 1]; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// following is ported from |
||||||
|
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl |
||||||
|
kernel |
||||||
|
void histogramRadixN ( |
||||||
|
__global K_T* unsortedKeys, |
||||||
|
__global int * buckets, |
||||||
|
uint shiftCount |
||||||
|
) |
||||||
|
{ |
||||||
|
const int RADIX_T = N; |
||||||
|
const int RADICES_T = (1 << RADIX_T); |
||||||
|
const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T; |
||||||
|
const int MASK_T = (1 << RADIX_T) - 1; |
||||||
|
int localBuckets[16] = {0,0,0,0,0,0,0,0, |
||||||
|
0,0,0,0,0,0,0,0}; |
||||||
|
int globalId = get_global_id(0); |
||||||
|
int numOfGroups = get_num_groups(0); |
||||||
|
|
||||||
|
/* Calculate thread-histograms */ |
||||||
|
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) |
||||||
|
{ |
||||||
|
uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)])); |
||||||
|
value = (value >> shiftCount) & MASK_T; |
||||||
|
#if IS_GT |
||||||
|
localBuckets[RADICES_T - value - 1]++; |
||||||
|
#else |
||||||
|
localBuckets[value]++; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) |
||||||
|
{ |
||||||
|
buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
kernel |
||||||
|
void permuteRadixN ( |
||||||
|
__global K_T* unsortedKeys, |
||||||
|
__global V_T* unsortedVals, |
||||||
|
__global int* scanedBuckets, |
||||||
|
uint shiftCount, |
||||||
|
__global K_T* sortedKeys, |
||||||
|
__global V_T* sortedVals |
||||||
|
) |
||||||
|
{ |
||||||
|
const int RADIX_T = N; |
||||||
|
const int RADICES_T = (1 << RADIX_T); |
||||||
|
const int MASK_T = (1<<RADIX_T) -1; |
||||||
|
|
||||||
|
int globalId = get_global_id(0); |
||||||
|
int numOfGroups = get_num_groups(0); |
||||||
|
const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N; |
||||||
|
int localIndex[16]; |
||||||
|
|
||||||
|
/*Load the index to local memory*/ |
||||||
|
for(int i = 0; i < RADICES_T; ++i) |
||||||
|
{ |
||||||
|
#if IS_GT |
||||||
|
localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)]; |
||||||
|
#else |
||||||
|
localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)]; |
||||||
|
#endif |
||||||
|
} |
||||||
|
/* Permute elements to appropriate location */ |
||||||
|
for(int i = 0; i < RADICES_T; ++i) |
||||||
|
{ |
||||||
|
int old_idx = mad24(globalId, RADICES_T, i); |
||||||
|
K_T ovalue = unsortedKeys[old_idx]; |
||||||
|
uint value = convertKey(as_uint(ovalue)); |
||||||
|
uint maskedValue = (value >> shiftCount) & MASK_T; |
||||||
|
uint index = localIndex[maskedValue]; |
||||||
|
sortedKeys[index] = ovalue; |
||||||
|
sortedVals[index] = unsortedVals[old_idx]; |
||||||
|
localIndex[maskedValue] = index + 1; |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,245 @@ |
|||||||
|
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||||
|
// |
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||||
|
// |
||||||
|
// By downloading, copying, installing or using the software you agree to this license. |
||||||
|
// If you do not agree to this license, do not download, install, |
||||||
|
// copy or use the software. |
||||||
|
// |
||||||
|
// |
||||||
|
// License Agreement |
||||||
|
// For Open Source Computer Vision Library |
||||||
|
// |
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||||
|
// Third party copyrights are property of their respective owners. |
||||||
|
// |
||||||
|
// @Authors |
||||||
|
// Peng Xiao, pengxiao@outlook.com |
||||||
|
// |
||||||
|
// Redistribution and use in source and binary forms, with or without modification, |
||||||
|
// are permitted provided that the following conditions are met: |
||||||
|
// |
||||||
|
// * Redistribution's of source code must retain the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer. |
||||||
|
// |
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer in the documentation |
||||||
|
// and/or other oclMaterials provided with the distribution. |
||||||
|
// |
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products |
||||||
|
// derived from this software without specific prior written permission. |
||||||
|
// |
||||||
|
// This software is provided by the copyright holders and contributors as is and |
||||||
|
// any express or implied warranties, including, but not limited to, the implied |
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||||
|
// indirect, incidental, special, exemplary, or consequential damages |
||||||
|
// (including, but not limited to, procurement of substitute goods or services; |
||||||
|
// loss of use, data, or profits; or business interruption) however caused |
||||||
|
// and on any theory of liability, whether in contract, strict liability, |
||||||
|
// or tort (including negligence or otherwise) arising in any way out of |
||||||
|
// the use of this software, even if advised of the possibility of such damage. |
||||||
|
// |
||||||
|
//M*/ |
||||||
|
|
||||||
|
#ifndef K_T |
||||||
|
#define K_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef V_T |
||||||
|
#define V_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef IS_GT |
||||||
|
#define IS_GT false |
||||||
|
#endif |
||||||
|
|
||||||
|
#if IS_GT |
||||||
|
#define my_comp(x,y) ((x) > (y)) |
||||||
|
#else |
||||||
|
#define my_comp(x,y) ((x) < (y)) |
||||||
|
#endif |
||||||
|
|
||||||
|
/////////////////////// Bitonic sort //////////////////////////// |
||||||
|
// ported from |
||||||
|
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl |
||||||
|
__kernel |
||||||
|
void bitonicSort |
||||||
|
( |
||||||
|
__global K_T * keys, |
||||||
|
__global V_T * vals, |
||||||
|
int count, |
||||||
|
int stage, |
||||||
|
int passOfStage |
||||||
|
) |
||||||
|
{ |
||||||
|
const int threadId = get_global_id(0); |
||||||
|
if(threadId >= count / 2) |
||||||
|
{ |
||||||
|
return; |
||||||
|
} |
||||||
|
const int pairDistance = 1 << (stage - passOfStage); |
||||||
|
const int blockWidth = 2 * pairDistance; |
||||||
|
|
||||||
|
int leftId = min( (threadId % pairDistance) |
||||||
|
+ (threadId / pairDistance) * blockWidth, count ); |
||||||
|
|
||||||
|
int rightId = min( leftId + pairDistance, count ); |
||||||
|
|
||||||
|
int temp; |
||||||
|
|
||||||
|
const V_T lval = vals[leftId]; |
||||||
|
const V_T rval = vals[rightId]; |
||||||
|
|
||||||
|
const K_T lkey = keys[leftId]; |
||||||
|
const K_T rkey = keys[rightId]; |
||||||
|
|
||||||
|
int sameDirectionBlockWidth = 1 << stage; |
||||||
|
|
||||||
|
if((threadId/sameDirectionBlockWidth) % 2 == 1) |
||||||
|
{ |
||||||
|
temp = rightId; |
||||||
|
rightId = leftId; |
||||||
|
leftId = temp; |
||||||
|
} |
||||||
|
|
||||||
|
const bool compareResult = my_comp(lkey, rkey); |
||||||
|
|
||||||
|
if(compareResult) |
||||||
|
{ |
||||||
|
keys[rightId] = rkey; |
||||||
|
keys[leftId] = lkey; |
||||||
|
vals[rightId] = rval; |
||||||
|
vals[leftId] = lval; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
keys[rightId] = lkey; |
||||||
|
keys[leftId] = rkey; |
||||||
|
vals[rightId] = lval; |
||||||
|
vals[leftId] = rval; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/////////////////////// Selection sort //////////////////////////// |
||||||
|
//kernel is ported from Bolt library: |
||||||
|
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl |
||||||
|
__kernel |
||||||
|
void selectionSortLocal |
||||||
|
( |
||||||
|
__global K_T * keys, |
||||||
|
__global V_T * vals, |
||||||
|
const int count, |
||||||
|
__local K_T * scratch |
||||||
|
) |
||||||
|
{ |
||||||
|
int i = get_local_id(0); // index in workgroup |
||||||
|
int numOfGroups = get_num_groups(0); // index in workgroup |
||||||
|
int groupID = get_group_id(0); |
||||||
|
int wg = get_local_size(0); // workgroup size = block size |
||||||
|
int n; // number of elements to be processed for this work group |
||||||
|
|
||||||
|
int offset = groupID * wg; |
||||||
|
int same = 0; |
||||||
|
|
||||||
|
vals += offset; |
||||||
|
keys += offset; |
||||||
|
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; |
||||||
|
|
||||||
|
int clamped_i= min(i, n - 1); |
||||||
|
|
||||||
|
K_T key1 = keys[clamped_i], key2; |
||||||
|
V_T val1 = vals[clamped_i]; |
||||||
|
scratch[i] = key1; |
||||||
|
barrier(CLK_LOCAL_MEM_FENCE); |
||||||
|
|
||||||
|
if(i >= n) |
||||||
|
{ |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
int pos = 0; |
||||||
|
for (int j=0;j<n;++j) |
||||||
|
{ |
||||||
|
key2 = scratch[j]; |
||||||
|
if(my_comp(key2, key1)) |
||||||
|
pos++;//calculate the rank of this element in this work group |
||||||
|
else |
||||||
|
{ |
||||||
|
if(my_comp(key1, key2)) |
||||||
|
continue; |
||||||
|
else |
||||||
|
{ |
||||||
|
// key1 and key2 are same |
||||||
|
same++; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
for (int j=0; j< same; j++) |
||||||
|
{ |
||||||
|
vals[pos + j] = val1; |
||||||
|
keys[pos + j] = key1; |
||||||
|
} |
||||||
|
} |
||||||
|
__kernel |
||||||
|
void selectionSortFinal |
||||||
|
( |
||||||
|
__global K_T * keys, |
||||||
|
__global V_T * vals, |
||||||
|
const int count |
||||||
|
) |
||||||
|
{ |
||||||
|
const int i = get_local_id(0); // index in workgroup |
||||||
|
const int numOfGroups = get_num_groups(0); // index in workgroup |
||||||
|
const int groupID = get_group_id(0); |
||||||
|
const int wg = get_local_size(0); // workgroup size = block size |
||||||
|
int pos = 0, same = 0; |
||||||
|
const int offset = get_group_id(0) * wg; |
||||||
|
const int remainder = count - wg*(numOfGroups-1); |
||||||
|
|
||||||
|
if((offset + i ) >= count) |
||||||
|
return; |
||||||
|
V_T val1 = vals[offset + i]; |
||||||
|
|
||||||
|
K_T key1 = keys[offset + i]; |
||||||
|
K_T key2; |
||||||
|
|
||||||
|
for(int j=0; j<numOfGroups-1; j++ ) |
||||||
|
{ |
||||||
|
for(int k=0; k<wg; k++) |
||||||
|
{ |
||||||
|
key2 = keys[j*wg + k]; |
||||||
|
if(my_comp(key1, key2)) |
||||||
|
break; |
||||||
|
else |
||||||
|
{ |
||||||
|
//Increment only if the value is not the same. |
||||||
|
if(my_comp(key2, key1)) |
||||||
|
pos++; |
||||||
|
else |
||||||
|
same++; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
for(int k=0; k<remainder; k++) |
||||||
|
{ |
||||||
|
key2 = keys[(numOfGroups-1)*wg + k]; |
||||||
|
if(my_comp(key1, key2)) |
||||||
|
break; |
||||||
|
else |
||||||
|
{ |
||||||
|
//Don't increment if the value is the same. |
||||||
|
if(my_comp(key2, key1)) |
||||||
|
pos++; |
||||||
|
else |
||||||
|
same++; |
||||||
|
} |
||||||
|
} |
||||||
|
for (int j=0; j< same; j++) |
||||||
|
{ |
||||||
|
vals[pos + j] = val1; |
||||||
|
keys[pos + j] = key1; |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,296 @@ |
|||||||
|
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||||
|
// |
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||||
|
// |
||||||
|
// By downloading, copying, installing or using the software you agree to this license. |
||||||
|
// If you do not agree to this license, do not download, install, |
||||||
|
// copy or use the software. |
||||||
|
// |
||||||
|
// |
||||||
|
// License Agreement |
||||||
|
// For Open Source Computer Vision Library |
||||||
|
// |
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||||
|
// Third party copyrights are property of their respective owners. |
||||||
|
// |
||||||
|
// @Authors |
||||||
|
// Peng Xiao, pengxiao@outlook.com |
||||||
|
// |
||||||
|
// Redistribution and use in source and binary forms, with or without modification, |
||||||
|
// are permitted provided that the following conditions are met: |
||||||
|
// |
||||||
|
// * Redistribution's of source code must retain the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer. |
||||||
|
// |
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||||
|
// this list of conditions and the following disclaimer in the documentation |
||||||
|
// and/or other oclMaterials provided with the distribution. |
||||||
|
// |
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products |
||||||
|
// derived from this software without specific prior written permission. |
||||||
|
// |
||||||
|
// This software is provided by the copyright holders and contributors as is and |
||||||
|
// any express or implied warranties, including, but not limited to, the implied |
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||||
|
// indirect, incidental, special, exemplary, or consequential damages |
||||||
|
// (including, but not limited to, procurement of substitute goods or services; |
||||||
|
// loss of use, data, or profits; or business interruption) however caused |
||||||
|
// and on any theory of liability, whether in contract, strict liability, |
||||||
|
// or tort (including negligence or otherwise) arising in any way out of |
||||||
|
// the use of this software, even if advised of the possibility of such damage. |
||||||
|
// |
||||||
|
//M*/ |
||||||
|
|
||||||
|
#ifndef K_T |
||||||
|
#define K_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef V_T |
||||||
|
#define V_T float |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef IS_GT |
||||||
|
#define IS_GT false |
||||||
|
#endif |
||||||
|
|
||||||
|
#if IS_GT |
||||||
|
#define my_comp(x,y) ((x) > (y)) |
||||||
|
#else |
||||||
|
#define my_comp(x,y) ((x) < (y)) |
||||||
|
#endif |
||||||
|
|
||||||
|
///////////// parallel merge sort /////////////// |
||||||
|
// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl |
||||||
|
uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) |
||||||
|
{ |
||||||
|
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence |
||||||
|
uint firstIndex = left; |
||||||
|
uint lastIndex = right; |
||||||
|
|
||||||
|
// This loops through [firstIndex, lastIndex) |
||||||
|
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch, |
||||||
|
// this while loop will be divergent within a wavefront |
||||||
|
while( firstIndex < lastIndex ) |
||||||
|
{ |
||||||
|
K_T dataVal = data[ firstIndex ]; |
||||||
|
|
||||||
|
// This branch will create divergent wavefronts |
||||||
|
if( my_comp( dataVal, searchVal ) ) |
||||||
|
{ |
||||||
|
firstIndex = firstIndex+1; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return firstIndex; |
||||||
|
} |
||||||
|
|
||||||
|
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted |
||||||
|
// by a base pointer and left and right index for a particular candidate value. The comparison operator is |
||||||
|
// passed as a functor parameter my_comp |
||||||
|
// This function returns an index that is the first index whos value would be equal to the searched value |
||||||
|
uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) |
||||||
|
{ |
||||||
|
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence |
||||||
|
uint firstIndex = left; |
||||||
|
uint lastIndex = right; |
||||||
|
|
||||||
|
// This loops through [firstIndex, lastIndex) |
||||||
|
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch, |
||||||
|
// this while loop will be divergent within a wavefront |
||||||
|
while( firstIndex < lastIndex ) |
||||||
|
{ |
||||||
|
// midIndex is the average of first and last, rounded down |
||||||
|
uint midIndex = ( firstIndex + lastIndex ) / 2; |
||||||
|
K_T midValue = data[ midIndex ]; |
||||||
|
|
||||||
|
// This branch will create divergent wavefronts |
||||||
|
if( my_comp( midValue, searchVal ) ) |
||||||
|
{ |
||||||
|
firstIndex = midIndex+1; |
||||||
|
// printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex ); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
lastIndex = midIndex; |
||||||
|
// printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex ); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return firstIndex; |
||||||
|
} |
||||||
|
|
||||||
|
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted |
||||||
|
// by a base pointer and left and right index for a particular candidate value. The comparison operator is |
||||||
|
// passed as a functor parameter my_comp |
||||||
|
// This function returns an index that is the first index whos value would be greater than the searched value |
||||||
|
// If the search value is not found in the sequence, upperbound returns the same result as lowerbound |
||||||
|
uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) |
||||||
|
{ |
||||||
|
uint upperBound = lowerBoundBinary( data, left, right, searchVal ); |
||||||
|
|
||||||
|
// printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound ); |
||||||
|
// If upperBound == right, then searchVal was not found in the sequence. Just return. |
||||||
|
if( upperBound != right ) |
||||||
|
{ |
||||||
|
// While the values are equal i.e. !(x < y) && !(y < x) increment the index |
||||||
|
K_T upperValue = data[ upperBound ]; |
||||||
|
while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) ) |
||||||
|
{ |
||||||
|
upperBound++; |
||||||
|
upperValue = data[ upperBound ]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return upperBound; |
||||||
|
} |
||||||
|
|
||||||
|
// This kernel implements merging of blocks of sorted data. The input to this kernel most likely is |
||||||
|
// the output of blockInsertionSortTemplate. It is expected that the source array contains multiple |
||||||
|
// blocks, each block is independently sorted. The goal is to write into the output buffer half as |
||||||
|
// many blocks, of double the size. The even and odd blocks are stably merged together to form |
||||||
|
// a new sorted block of twice the size. The algorithm is out-of-place. |
||||||
|
kernel void merge( |
||||||
|
global K_T* iKey_ptr, |
||||||
|
global V_T* iValue_ptr, |
||||||
|
global K_T* oKey_ptr, |
||||||
|
global V_T* oValue_ptr, |
||||||
|
const uint srcVecSize, |
||||||
|
const uint srcLogicalBlockSize, |
||||||
|
local K_T* key_lds, |
||||||
|
local V_T* val_lds |
||||||
|
) |
||||||
|
{ |
||||||
|
size_t globalID = get_global_id( 0 ); |
||||||
|
size_t groupID = get_group_id( 0 ); |
||||||
|
size_t localID = get_local_id( 0 ); |
||||||
|
size_t wgSize = get_local_size( 0 ); |
||||||
|
|
||||||
|
// Abort threads that are passed the end of the input vector |
||||||
|
if( globalID >= srcVecSize ) |
||||||
|
return; // on SI this doesn't mess-up barriers |
||||||
|
|
||||||
|
// For an element in sequence A, find the lowerbound index for it in sequence B |
||||||
|
uint srcBlockNum = globalID / srcLogicalBlockSize; |
||||||
|
uint srcBlockIndex = globalID % srcLogicalBlockSize; |
||||||
|
|
||||||
|
// printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex ); |
||||||
|
|
||||||
|
// Pairs of even-odd blocks will be merged together |
||||||
|
// An even block should search for an insertion point in the next odd block, |
||||||
|
// and the odd block should look for an insertion point in the corresponding previous even block |
||||||
|
uint dstLogicalBlockSize = srcLogicalBlockSize<<1; |
||||||
|
uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 ); |
||||||
|
leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize; |
||||||
|
leftBlockIndex = min( leftBlockIndex, srcVecSize ); |
||||||
|
uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize ); |
||||||
|
|
||||||
|
// if( localID == 0 ) |
||||||
|
// { |
||||||
|
// printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex ); |
||||||
|
// } |
||||||
|
|
||||||
|
// For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex |
||||||
|
// uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex; |
||||||
|
uint insertionIndex = 0; |
||||||
|
if( (srcBlockNum & 0x1) == 0 ) |
||||||
|
{ |
||||||
|
insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; |
||||||
|
} |
||||||
|
|
||||||
|
// The index of an element in the result sequence is the summation of it's indixes in the two input |
||||||
|
// sequences |
||||||
|
uint dstBlockIndex = srcBlockIndex + insertionIndex; |
||||||
|
uint dstBlockNum = srcBlockNum/2; |
||||||
|
|
||||||
|
// if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 ) |
||||||
|
// { |
||||||
|
// printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex ); |
||||||
|
// printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] ); |
||||||
|
// } |
||||||
|
oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ]; |
||||||
|
oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ]; |
||||||
|
// printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex ); |
||||||
|
} |
||||||
|
|
||||||
|
kernel void blockInsertionSort( |
||||||
|
global K_T* key_ptr, |
||||||
|
global V_T* value_ptr, |
||||||
|
const uint vecSize, |
||||||
|
local K_T* key_lds, |
||||||
|
local V_T* val_lds |
||||||
|
) |
||||||
|
{ |
||||||
|
size_t gloId = get_global_id( 0 ); |
||||||
|
size_t groId = get_group_id( 0 ); |
||||||
|
size_t locId = get_local_id( 0 ); |
||||||
|
size_t wgSize = get_local_size( 0 ); |
||||||
|
|
||||||
|
bool in_range = gloId < vecSize; |
||||||
|
K_T key; |
||||||
|
V_T val; |
||||||
|
// Abort threads that are passed the end of the input vector |
||||||
|
if (in_range) |
||||||
|
{ |
||||||
|
// Make a copy of the entire input array into fast local memory |
||||||
|
key = key_ptr[ gloId ]; |
||||||
|
val = value_ptr[ gloId ]; |
||||||
|
key_lds[ locId ] = key; |
||||||
|
val_lds[ locId ] = val; |
||||||
|
} |
||||||
|
barrier( CLK_LOCAL_MEM_FENCE ); |
||||||
|
// Sorts a workgroup using a naive insertion sort |
||||||
|
// The sort uses one thread within a workgroup to sort the entire workgroup |
||||||
|
if( locId == 0 && in_range ) |
||||||
|
{ |
||||||
|
// The last workgroup may have an irregular size, so we calculate a per-block endIndex |
||||||
|
// endIndex is essentially emulating a mod operator with subtraction and multiply |
||||||
|
size_t endIndex = vecSize - ( groId * wgSize ); |
||||||
|
endIndex = min( endIndex, wgSize ); |
||||||
|
|
||||||
|
// printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); |
||||||
|
|
||||||
|
// Indices are signed because the while loop will generate a -1 index inside of the max function |
||||||
|
for( int currIndex = 1; currIndex < endIndex; ++currIndex ) |
||||||
|
{ |
||||||
|
key = key_lds[ currIndex ]; |
||||||
|
val = val_lds[ currIndex ]; |
||||||
|
int scanIndex = currIndex; |
||||||
|
K_T ldsKey = key_lds[scanIndex - 1]; |
||||||
|
while( scanIndex > 0 && my_comp( key, ldsKey ) ) |
||||||
|
{ |
||||||
|
V_T ldsVal = val_lds[scanIndex - 1]; |
||||||
|
|
||||||
|
// If the keys are being swapped, make sure the values are swapped identicaly |
||||||
|
key_lds[ scanIndex ] = ldsKey; |
||||||
|
val_lds[ scanIndex ] = ldsVal; |
||||||
|
|
||||||
|
scanIndex = scanIndex - 1; |
||||||
|
ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1 |
||||||
|
} |
||||||
|
key_lds[ scanIndex ] = key; |
||||||
|
val_lds[ scanIndex ] = val; |
||||||
|
} |
||||||
|
} |
||||||
|
barrier( CLK_LOCAL_MEM_FENCE ); |
||||||
|
|
||||||
|
if(in_range) |
||||||
|
{ |
||||||
|
key = key_lds[ locId ]; |
||||||
|
key_ptr[ gloId ] = key; |
||||||
|
|
||||||
|
val = val_lds[ locId ]; |
||||||
|
value_ptr[ gloId ] = val; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
///////////// Radix sort from b40c library ///////////// |
@ -0,0 +1,453 @@ |
|||||||
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||||
|
//
|
||||||
|
// By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
// If you do not agree to this license, do not download, install,
|
||||||
|
// copy or use the software.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// License Agreement
|
||||||
|
// For Open Source Computer Vision Library
|
||||||
|
//
|
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
//
|
||||||
|
// @Authors
|
||||||
|
// Peng Xiao, pengxiao@outlook.com
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
// are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistribution's of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other oclMaterials provided with the distribution.
|
||||||
|
//
|
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products
|
||||||
|
// derived from this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// This software is provided by the copyright holders and contributors as is and
|
||||||
|
// any express or implied warranties, including, but not limited to, the implied
|
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||||
|
// indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
// (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
// loss of use, data, or profits; or business interruption) however caused
|
||||||
|
// and on any theory of liability, whether in contract, strict liability,
|
||||||
|
// or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
// the use of this software, even if advised of the possibility of such damage.
|
||||||
|
//
|
||||||
|
//M*/
|
||||||
|
|
||||||
|
#include <iomanip> |
||||||
|
#include "precomp.hpp" |
||||||
|
|
||||||
|
namespace cv |
||||||
|
{ |
||||||
|
namespace ocl |
||||||
|
{ |
||||||
|
|
||||||
|
extern const char * kernel_sort_by_key; |
||||||
|
extern const char * kernel_stablesort_by_key; |
||||||
|
extern const char * kernel_radix_sort_by_key; |
||||||
|
|
||||||
|
//TODO(pengx17): change this value depending on device other than a constant
|
||||||
|
const static unsigned int GROUP_SIZE = 256; |
||||||
|
|
||||||
|
const char * depth_strings[] = |
||||||
|
{ |
||||||
|
"uchar", //CV_8U
|
||||||
|
"char", //CV_8S
|
||||||
|
"ushort", //CV_16U
|
||||||
|
"short", //CV_16S
|
||||||
|
"int", //CV_32S
|
||||||
|
"float", //CV_32F
|
||||||
|
"double" //CV_64F
|
||||||
|
}; |
||||||
|
|
||||||
|
void genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf) |
||||||
|
{ |
||||||
|
sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s", |
||||||
|
isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]); |
||||||
|
if(vals.oclchannels() > 1) |
||||||
|
{ |
||||||
|
sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels(), 10); |
||||||
|
} |
||||||
|
} |
||||||
|
inline bool isSizePowerOf2(size_t size) |
||||||
|
{ |
||||||
|
return ((size - 1) & (size)) == 0; |
||||||
|
} |
||||||
|
|
||||||
|
namespace bitonic_sort |
||||||
|
{ |
||||||
|
static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||||
|
{ |
||||||
|
CV_Assert(isSizePowerOf2(vecSize)); |
||||||
|
|
||||||
|
Context * cxt = Context::getContext(); |
||||||
|
size_t globalThreads[3] = {vecSize / 2, 1, 1}; |
||||||
|
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||||
|
|
||||||
|
// 2^numStages should be equal to vecSize or the output is invalid
|
||||||
|
int numStages = 0; |
||||||
|
for(int i = vecSize; i > 1; i >>= 1) |
||||||
|
{ |
||||||
|
++numStages; |
||||||
|
} |
||||||
|
char build_opt_buf [100]; |
||||||
|
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||||
|
const int argc = 5; |
||||||
|
std::vector< std::pair<size_t, const void *> > args(argc); |
||||||
|
String kernelname = "bitonicSort"; |
||||||
|
|
||||||
|
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||||
|
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||||
|
args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize); |
||||||
|
|
||||||
|
for(int stage = 0; stage < numStages; ++stage) |
||||||
|
{ |
||||||
|
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); |
||||||
|
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) |
||||||
|
{ |
||||||
|
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); |
||||||
|
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} /* bitonic_sort */ |
||||||
|
|
||||||
|
namespace selection_sort |
||||||
|
{ |
||||||
|
// FIXME:
|
||||||
|
// This function cannot sort arrays with duplicated keys
|
||||||
|
static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||||
|
{ |
||||||
|
CV_Error(-1, "This function is incorrect at the moment."); |
||||||
|
Context * cxt = Context::getContext(); |
||||||
|
|
||||||
|
size_t globalThreads[3] = {vecSize, 1, 1}; |
||||||
|
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||||
|
|
||||||
|
std::vector< std::pair<size_t, const void *> > args; |
||||||
|
char build_opt_buf [100]; |
||||||
|
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||||
|
|
||||||
|
//local
|
||||||
|
String kernelname = "selectionSortLocal"; |
||||||
|
int lds_size = GROUP_SIZE * keys.elemSize(); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize)); |
||||||
|
args.push_back(std::make_pair(lds_size, (void*)NULL)); |
||||||
|
|
||||||
|
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||||
|
|
||||||
|
//final
|
||||||
|
kernelname = "selectionSortFinal"; |
||||||
|
args.pop_back(); |
||||||
|
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||||
|
} |
||||||
|
|
||||||
|
} /* selection_sort */ |
||||||
|
|
||||||
|
|
||||||
|
namespace radix_sort |
||||||
|
{ |
||||||
|
//FIXME(pengx17):
|
||||||
|
// exclusive scan, need to be optimized as this is too naive...
|
||||||
|
//void naive_scan_addition(oclMat& input, oclMat& output)
|
||||||
|
//{
|
||||||
|
// Context * cxt = Context::getContext();
|
||||||
|
// size_t vecSize = input.cols;
|
||||||
|
// size_t globalThreads[3] = {1, 1, 1};
|
||||||
|
// size_t localThreads[3] = {1, 1, 1};
|
||||||
|
//
|
||||||
|
// String kernelname = "naiveScanAddition";
|
||||||
|
//
|
||||||
|
// std::vector< std::pair<size_t, const void *> > args;
|
||||||
|
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
|
||||||
|
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
|
||||||
|
// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
|
||||||
|
// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||||
|
//}
|
||||||
|
|
||||||
|
void naive_scan_addition_cpu(oclMat& input, oclMat& output) |
||||||
|
{ |
||||||
|
Mat m_input = input, m_output(output.size(), output.type()); |
||||||
|
MatIterator_<int> i_mit = m_input.begin<int>(); |
||||||
|
MatIterator_<int> o_mit = m_output.begin<int>(); |
||||||
|
*o_mit = 0; |
||||||
|
++i_mit; |
||||||
|
++o_mit; |
||||||
|
for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit) |
||||||
|
{ |
||||||
|
*o_mit = *(o_mit - 1) + *(i_mit - 1); |
||||||
|
} |
||||||
|
output = m_output; |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
//radix sort ported from Bolt
|
||||||
|
static void sort_by_key(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan) |
||||||
|
{ |
||||||
|
CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
|
||||||
|
|
||||||
|
bool isKeyFloat = keys.type() == CV_32F; |
||||||
|
|
||||||
|
const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
|
||||||
|
//local array of 16 elements in the histogram kernel.
|
||||||
|
const int RADICES = (1 << RADIX); //Values handeled by each work-item?
|
||||||
|
|
||||||
|
bool newBuffer = false; |
||||||
|
size_t vecSize = origVecSize; |
||||||
|
|
||||||
|
unsigned int groupSize = RADICES; |
||||||
|
|
||||||
|
size_t mulFactor = groupSize * RADICES; |
||||||
|
|
||||||
|
oclMat buffer_keys, buffer_vals; |
||||||
|
|
||||||
|
if(origVecSize % mulFactor != 0) |
||||||
|
{ |
||||||
|
vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor; |
||||||
|
buffer_keys.create(1, vecSize, keys.type()); |
||||||
|
buffer_vals.create(1, vecSize, vals.type()); |
||||||
|
Scalar padding_value; |
||||||
|
oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1)); |
||||||
|
|
||||||
|
if(isGreaterThan) |
||||||
|
{ |
||||||
|
switch(buffer_keys.depth()) |
||||||
|
{ |
||||||
|
case CV_32F: |
||||||
|
padding_value = Scalar::all(-FLT_MAX); |
||||||
|
break; |
||||||
|
case CV_32S: |
||||||
|
padding_value = Scalar::all(INT_MIN); |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
switch(buffer_keys.depth()) |
||||||
|
{ |
||||||
|
case CV_32F: |
||||||
|
padding_value = Scalar::all(FLT_MAX); |
||||||
|
break; |
||||||
|
case CV_32S: |
||||||
|
padding_value = Scalar::all(INT_MAX); |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
ocl::copyMakeBorder( |
||||||
|
keys(Rect(0,0,origVecSize,1)), buffer_keys,
|
||||||
|
0, 0, 0, vecSize - origVecSize,
|
||||||
|
BORDER_CONSTANT, padding_value); |
||||||
|
vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals); |
||||||
|
newBuffer = true; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
buffer_keys = keys; |
||||||
|
buffer_vals = vals; |
||||||
|
newBuffer = false; |
||||||
|
} |
||||||
|
oclMat swap_input_keys(1, vecSize, keys.type()); |
||||||
|
oclMat swap_input_vals(1, vecSize, vals.type()); |
||||||
|
oclMat hist_bin_keys(1, vecSize, CV_32SC1); |
||||||
|
oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1); |
||||||
|
|
||||||
|
Context * cxt = Context::getContext(); |
||||||
|
|
||||||
|
size_t globalThreads[3] = {vecSize / RADICES, 1, 1}; |
||||||
|
size_t localThreads[3] = {groupSize, 1, 1}; |
||||||
|
|
||||||
|
std::vector< std::pair<size_t, const void *> > args; |
||||||
|
char build_opt_buf [100]; |
||||||
|
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||||
|
|
||||||
|
//additional build option for radix sort
|
||||||
|
sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
|
||||||
|
|
||||||
|
String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")}; |
||||||
|
|
||||||
|
int swap = 0; |
||||||
|
for(int bits = 0; bits < (keys.elemSize() * 8); bits += RADIX) |
||||||
|
{ |
||||||
|
args.clear(); |
||||||
|
//Do a histogram pass locally
|
||||||
|
if(swap == 0) |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||||
|
} |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); |
||||||
|
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads, |
||||||
|
args, -1, -1, build_opt_buf); |
||||||
|
|
||||||
|
args.clear(); |
||||||
|
//Perform a global scan
|
||||||
|
naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys); |
||||||
|
// end of scan
|
||||||
|
if(swap == 0) |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); |
||||||
|
} |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); |
||||||
|
|
||||||
|
if(swap == 0) |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); |
||||||
|
} |
||||||
|
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads, |
||||||
|
args, -1, -1, build_opt_buf); |
||||||
|
swap = swap ? 0 : 1; |
||||||
|
} |
||||||
|
if(newBuffer) |
||||||
|
{ |
||||||
|
buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys); |
||||||
|
buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
} /* radix_sort */ |
||||||
|
|
||||||
|
namespace merge_sort |
||||||
|
{ |
||||||
|
static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) |
||||||
|
{ |
||||||
|
Context * cxt = Context::getContext(); |
||||||
|
|
||||||
|
size_t globalThreads[3] = {vecSize, 1, 1}; |
||||||
|
size_t localThreads[3] = {GROUP_SIZE, 1, 1}; |
||||||
|
|
||||||
|
std::vector< std::pair<size_t, const void *> > args; |
||||||
|
char build_opt_buf [100]; |
||||||
|
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); |
||||||
|
|
||||||
|
String kernelname[] = {String("blockInsertionSort"), String("merge")}; |
||||||
|
int keylds_size = GROUP_SIZE * keys.elemSize(); |
||||||
|
int vallds_size = GROUP_SIZE * vals.elemSize(); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); |
||||||
|
args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize)); |
||||||
|
args.push_back(std::make_pair(keylds_size, (void*)NULL)); |
||||||
|
args.push_back(std::make_pair(vallds_size, (void*)NULL)); |
||||||
|
|
||||||
|
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||||
|
|
||||||
|
// Early exit for the case of no merge passes, values are already in destination vector
|
||||||
|
if(vecSize <= GROUP_SIZE) |
||||||
|
{ |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
// An odd number of elements requires an extra merge pass to sort
|
||||||
|
size_t numMerges = 0; |
||||||
|
// Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
|
||||||
|
// this is how many merge passes we want
|
||||||
|
size_t log2BlockSize = vecSize >> 6; |
||||||
|
for( ; log2BlockSize > 1; log2BlockSize >>= 1 ) |
||||||
|
{ |
||||||
|
++numMerges; |
||||||
|
} |
||||||
|
// Check to see if the input vector size is a power of 2, if not we will need last merge pass
|
||||||
|
numMerges += isSizePowerOf2(vecSize)? 1: 0; |
||||||
|
|
||||||
|
// Allocate a flipflop buffer because the merge passes are out of place
|
||||||
|
oclMat tmpKeyBuffer(keys.size(), keys.type()); |
||||||
|
oclMat tmpValBuffer(vals.size(), vals.type()); |
||||||
|
args.resize(8); |
||||||
|
|
||||||
|
args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize); |
||||||
|
args[6] = std::make_pair(keylds_size, (void*)NULL); |
||||||
|
args[7] = std::make_pair(vallds_size, (void*)NULL); |
||||||
|
|
||||||
|
for(size_t pass = 1; pass <= numMerges; ++pass ) |
||||||
|
{ |
||||||
|
// For each pass, flip the input-output buffers
|
||||||
|
if( pass & 0x1 ) |
||||||
|
{ |
||||||
|
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||||
|
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||||
|
args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); |
||||||
|
args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); |
||||||
|
args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); |
||||||
|
args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); |
||||||
|
args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); |
||||||
|
} |
||||||
|
// For each pass, the merge window doubles
|
||||||
|
unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) ); |
||||||
|
args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize); |
||||||
|
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf); |
||||||
|
} |
||||||
|
// If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
|
||||||
|
// the results back into the input array
|
||||||
|
if( numMerges & 1 ) |
||||||
|
{ |
||||||
|
tmpKeyBuffer.copyTo(keys); |
||||||
|
tmpValBuffer.copyTo(vals); |
||||||
|
} |
||||||
|
} |
||||||
|
} /* merge_sort */ |
||||||
|
|
||||||
|
} |
||||||
|
} /* namespace cv { namespace ocl */ |
||||||
|
|
||||||
|
|
||||||
|
void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan) |
||||||
|
{ |
||||||
|
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||||
|
CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
|
||||||
|
CV_Assert( vecSize <= static_cast<size_t>(keys.cols) ); |
||||||
|
switch(method) |
||||||
|
{ |
||||||
|
case SORT_BITONIC: |
||||||
|
bitonic_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); |
||||||
|
break; |
||||||
|
case SORT_SELECTION: |
||||||
|
selection_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); |
||||||
|
break; |
||||||
|
case SORT_RADIX: |
||||||
|
radix_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); |
||||||
|
break; |
||||||
|
case SORT_MERGE: |
||||||
|
merge_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, int method, bool isGreaterThan) |
||||||
|
{ |
||||||
|
CV_Assert( keys.size() == vals.size() ); |
||||||
|
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||||
|
size_t vecSize = static_cast<size_t>(keys.cols); |
||||||
|
sort_by_key(keys, vals, vecSize, method, isGreaterThan); |
||||||
|
} |
@ -0,0 +1,244 @@ |
|||||||
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||||
|
//
|
||||||
|
// By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
// If you do not agree to this license, do not download, install,
|
||||||
|
// copy or use the software.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// License Agreement
|
||||||
|
// For Open Source Computer Vision Library
|
||||||
|
//
|
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
//
|
||||||
|
// @Authors
|
||||||
|
// Peng Xiao, pengxiao@outlook.com
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
// are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistribution's of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other oclMaterials provided with the distribution.
|
||||||
|
//
|
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products
|
||||||
|
// derived from this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// This software is provided by the copyright holders and contributors as is and
|
||||||
|
// any express or implied warranties, including, but not limited to, the implied
|
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||||
|
// indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
// (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
// loss of use, data, or profits; or business interruption) however caused
|
||||||
|
// and on any theory of liability, whether in contract, strict liability,
|
||||||
|
// or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
// the use of this software, even if advised of the possibility of such damage.
|
||||||
|
//
|
||||||
|
//M*/
|
||||||
|
#include <map> |
||||||
|
#include <functional> |
||||||
|
#include "precomp.hpp" |
||||||
|
|
||||||
|
using namespace std; |
||||||
|
using namespace cvtest; |
||||||
|
using namespace testing; |
||||||
|
using namespace cv; |
||||||
|
|
||||||
|
|
||||||
|
namespace |
||||||
|
{ |
||||||
|
IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool) |
||||||
|
IMPLEMENT_PARAM_CLASS(InputSize, int) |
||||||
|
IMPLEMENT_PARAM_CLASS(SortMethod, int) |
||||||
|
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
struct KV_CVTYPE{ static int toType() {return 0;} }; |
||||||
|
|
||||||
|
template<> struct KV_CVTYPE<int> { static int toType() {return CV_32SC1;} }; |
||||||
|
template<> struct KV_CVTYPE<float>{ static int toType() {return CV_32FC1;} }; |
||||||
|
template<> struct KV_CVTYPE<Vec2i>{ static int toType() {return CV_32SC2;} }; |
||||||
|
template<> struct KV_CVTYPE<Vec2f>{ static int toType() {return CV_32FC2;} }; |
||||||
|
|
||||||
|
template<class key_type, class val_type> |
||||||
|
bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2) |
||||||
|
{ |
||||||
|
return p1.first > p2.first; |
||||||
|
} |
||||||
|
|
||||||
|
template<class key_type, class val_type> |
||||||
|
bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2) |
||||||
|
{ |
||||||
|
return p1.first < p2.first; |
||||||
|
} |
||||||
|
|
||||||
|
template<class key_type, class val_type> |
||||||
|
void toKVPair( |
||||||
|
MatConstIterator_<key_type> kit, |
||||||
|
MatConstIterator_<val_type> vit, |
||||||
|
int vecSize, |
||||||
|
vector<pair<key_type, val_type> >& kvres |
||||||
|
) |
||||||
|
{ |
||||||
|
kvres.clear(); |
||||||
|
for(int i = 0; i < vecSize; i ++) |
||||||
|
{ |
||||||
|
kvres.push_back(make_pair(*kit, *vit)); |
||||||
|
++kit; |
||||||
|
++vit; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
template<class key_type, class val_type> |
||||||
|
void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false) |
||||||
|
{ |
||||||
|
vector<pair<key_type, val_type> > kvres; |
||||||
|
toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), keys.cols, kvres); |
||||||
|
|
||||||
|
if(isGreater) |
||||||
|
{ |
||||||
|
std::sort(kvres.begin(), kvres.end(), kvgreater<key_type, val_type>); |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
std::sort(kvres.begin(), kvres.end(), kvless<key_type, val_type>); |
||||||
|
} |
||||||
|
key_type * kptr = keys.ptr<key_type>(); |
||||||
|
val_type * vptr = vals.ptr<val_type>(); |
||||||
|
for(int i = 0; i < keys.cols; i ++) |
||||||
|
{ |
||||||
|
kptr[i] = kvres[i].first; |
||||||
|
vptr[i] = kvres[i].second; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
class SortByKey_STL |
||||||
|
{ |
||||||
|
public: |
||||||
|
static void sort(cv::Mat&, cv::Mat&, bool is_gt); |
||||||
|
private: |
||||||
|
typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool); |
||||||
|
SortByKey_STL(); |
||||||
|
quick_sorter quick_sorters[CV_64FC4][CV_64FC4]; |
||||||
|
static SortByKey_STL instance; |
||||||
|
}; |
||||||
|
|
||||||
|
SortByKey_STL SortByKey_STL::instance = SortByKey_STL(); |
||||||
|
|
||||||
|
SortByKey_STL::SortByKey_STL() |
||||||
|
{ |
||||||
|
memset(instance.quick_sorters, 0, sizeof(quick_sorters)); |
||||||
|
#define NEW_SORTER(KT, VT) \ |
||||||
|
instance.quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>; |
||||||
|
|
||||||
|
NEW_SORTER(int, int); |
||||||
|
NEW_SORTER(int, Vec2i); |
||||||
|
NEW_SORTER(int, float); |
||||||
|
NEW_SORTER(int, Vec2f); |
||||||
|
|
||||||
|
NEW_SORTER(float, int); |
||||||
|
NEW_SORTER(float, Vec2i); |
||||||
|
NEW_SORTER(float, float); |
||||||
|
NEW_SORTER(float, Vec2f); |
||||||
|
#undef NEW_SORTER |
||||||
|
} |
||||||
|
|
||||||
|
void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt) |
||||||
|
{ |
||||||
|
instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt); |
||||||
|
} |
||||||
|
|
||||||
|
bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_, |
||||||
|
const Mat& /*dkeys_*/, const Mat& dvals_) |
||||||
|
{ |
||||||
|
int cn_val = gvals_.channels(); |
||||||
|
int count = gkeys_.cols; |
||||||
|
|
||||||
|
//for convenience we convert depth to float and channels to 1
|
||||||
|
Mat gkeys, gvals, dkeys, dvals; |
||||||
|
gkeys_.reshape(1).convertTo(gkeys, CV_32F); |
||||||
|
gvals_.reshape(1).convertTo(gvals, CV_32F); |
||||||
|
//dkeys_.reshape(1).convertTo(dkeys, CV_32F);
|
||||||
|
dvals_.reshape(1).convertTo(dvals, CV_32F); |
||||||
|
float * gkptr = gkeys.ptr<float>(); |
||||||
|
float * gvptr = gvals.ptr<float>(); |
||||||
|
//float * dkptr = dkeys.ptr<float>();
|
||||||
|
float * dvptr = dvals.ptr<float>(); |
||||||
|
|
||||||
|
for(int i = 0; i < count - 1; ++i) |
||||||
|
{ |
||||||
|
int iden_count = 0; |
||||||
|
// firstly calculate the number of identical keys
|
||||||
|
while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count]) |
||||||
|
{ |
||||||
|
++ iden_count; |
||||||
|
} |
||||||
|
|
||||||
|
// sort dv and gv
|
||||||
|
int num_of_val = (iden_count + 1) * cn_val; |
||||||
|
std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val); |
||||||
|
std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val); |
||||||
|
|
||||||
|
// then check if [i, i + iden_count) is the same
|
||||||
|
for(int j = 0; j < num_of_val; ++j) |
||||||
|
{ |
||||||
|
if(gvptr[i + j] != dvptr[i + j]) |
||||||
|
{ |
||||||
|
return false; |
||||||
|
} |
||||||
|
} |
||||||
|
i += iden_count; |
||||||
|
} |
||||||
|
return true; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#define INPUT_SIZES Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
|
||||||
|
#define KEY_TYPES Values(MatType(CV_32SC1), MatType(CV_32FC1)) |
||||||
|
#define VAL_TYPES Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2)) |
||||||
|
#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/) |
||||||
|
#define F_OR_T Values(IsGreaterThan(false), IsGreaterThan(true)) |
||||||
|
|
||||||
|
PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan) |
||||||
|
{ |
||||||
|
InputSize input_size; |
||||||
|
MatType key_type, val_type; |
||||||
|
SortMethod method; |
||||||
|
IsGreaterThan is_gt; |
||||||
|
|
||||||
|
Mat mat_key, mat_val; |
||||||
|
virtual void SetUp() |
||||||
|
{ |
||||||
|
input_size = GET_PARAM(0); |
||||||
|
key_type = GET_PARAM(1); |
||||||
|
val_type = GET_PARAM(2); |
||||||
|
method = GET_PARAM(3); |
||||||
|
is_gt = GET_PARAM(4); |
||||||
|
|
||||||
|
using namespace cv; |
||||||
|
// fill key and val
|
||||||
|
mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX); |
||||||
|
mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
TEST_P(SortByKey, Accuracy) |
||||||
|
{ |
||||||
|
using namespace cv; |
||||||
|
ocl::oclMat oclmat_key(mat_key); |
||||||
|
ocl::oclMat oclmat_val(mat_val); |
||||||
|
|
||||||
|
ocl::sort_by_key(oclmat_key, oclmat_val, method, is_gt); |
||||||
|
SortByKey_STL::sort(mat_key, mat_val, is_gt); |
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0); |
||||||
|
EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val)); |
||||||
|
} |
||||||
|
INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T)); |
Loading…
Reference in new issue