Merge pull request #1189 from pengx17:2.4_sort_by_key

12 years ago · 95bdd4b670
parent a7da129984 8b90cb375a
commit 95bdd4b670
7 changed files with 1477 additions and 1 deletions
--- a/modules/ocl/doc/operations_on_matrices.rst
+++ b/modules/ocl/doc/operations_on_matrices.rst
@ -481,4 +481,40 @@ Performs generalized matrix multiplication.
            * **GEMM_1_T** transpose  ``src1``
            * **GEMM_2_T** transpose  ``src2``
-.. seealso:: :ocv:func:`gemm`
+.. seealso:: :ocv:func:`gemm`
 ocl::sortByKey
 ------------------
 Returns void
 .. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)
    :param keys:   The keys to be used as sorting indices.
    :param values: The array of values.
    :param isGreaterThan: Determine sorting order.
    :param method: supported sorting methods:
            * **SORT_BITONIC**   bitonic sort, only support power-of-2 buffer size
            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
            * **SORT_MERGE**     merge sort
            * **SORT_RADIX**     radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``)
 Returns the sorted result of all the elements in values based on equivalent keys.
 The element unit in the values to be sorted is determined from the data type, 
 i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless its matrix dimension.
 Both keys and values will be sorted inplace. 
 Keys needs to be a **single** channel `oclMat`.
 Example::
    input -
    keys   = {2,    3,   1}   (CV_8UC1)
    values = {10,5, 4,3, 6,2} (CV_8UC2)
    sortByKey(keys, values, SORT_SELECTION, false);
    output -
    keys   = {1,    2,   3}   (CV_8UC1)
    values = {6,2, 10,5, 4,3} (CV_8UC2)
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -1673,6 +1673,31 @@ namespace cv
            oclMat diff_buf;
            oclMat norm_buf;
        };
        // current supported sorting methods
        enum
        {
            SORT_BITONIC,   // only support power-of-2 buffer size
            SORT_SELECTION, // cannot sort duplicate keys
            SORT_MERGE,
            SORT_RADIX      // only support signed int/float keys(CV_32S/CV_32F)
        };
        //! Returns the sorted result of all the elements in input based on equivalent keys.
        //
        //  The element unit in the values to be sorted is determined from the data type, 
        //  i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its
        //  matrix dimension.
        //  both keys and values will be sorted inplace
        //  Key needs to be single channel oclMat.
        //
        //  Example:
        //  input -
        //    keys   = {2,    3,   1}   (CV_8UC1)
        //    values = {10,5, 4,3, 6,2} (CV_8UC2)
        //  sortByKey(keys, values, SORT_SELECTION, false);
        //  output -
        //    keys   = {1,    2,   3}   (CV_8UC1)
        //    values = {6,2, 10,5, 4,3} (CV_8UC2)
        void CV_EXPORTS sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
--- a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
@ -0,0 +1,176 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 
 #ifndef N   // number of radices
 #define N 4
 #endif
 #ifndef K_T
 #define K_T float
 #endif
 #ifndef V_T
 #define V_T float
 #endif
 #ifndef IS_GT
 #define IS_GT 0
 #endif
 // from Thrust::b40c, link:
 // https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
 __inline uint convertKey(uint converted_key)
 {
 #ifdef K_FLT
    unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
    converted_key ^= mask;
 #elif defined(K_INT)
    const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
    converted_key ^= SIGN_MASK;	
 #else
 #endif
    return converted_key;
 }
 //FIXME(pengx17): 
 // exclusive scan, need to be optimized as this is too naive...
 kernel
    void naiveScanAddition(
    __global int * input,
    __global int * output,
    int size
    )
 {
    if(get_global_id(0) == 0)
    {
        output[0] = 0;
        for(int i = 1; i < size; i ++)
        {
            output[i] = output[i - 1] + input[i - 1];
        }
    }
 }
 // following is ported from
 // https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
 kernel
    void histogramRadixN (
    __global K_T* unsortedKeys,
    __global int * buckets,
    uint shiftCount
    )
 {
    const int RADIX_T     = N;
    const int RADICES_T   = (1 << RADIX_T);
    const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T; 
    const int MASK_T      = (1 << RADIX_T) - 1;
    int localBuckets[16] = {0,0,0,0,0,0,0,0,
                            0,0,0,0,0,0,0,0};
    int globalId    = get_global_id(0);
    int numOfGroups = get_num_groups(0);
    /* Calculate thread-histograms */
    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
    {
        uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
        value = (value >> shiftCount) & MASK_T;
 #if IS_GT
        localBuckets[RADICES_T - value - 1]++;
 #else
        localBuckets[value]++;
 #endif
    }
    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
    {
        buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
    }
 }
 kernel
    void permuteRadixN (
    __global K_T*  unsortedKeys,
    __global V_T*  unsortedVals,
    __global int* scanedBuckets,
    uint shiftCount,
    __global K_T*  sortedKeys,
    __global V_T*  sortedVals
    )
 {
    const int RADIX_T     = N;
    const int RADICES_T   = (1 << RADIX_T);
    const int MASK_T = (1<<RADIX_T)  -1;
    int globalId  = get_global_id(0);
    int numOfGroups = get_num_groups(0);
    const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
    int  localIndex[16];
    /*Load the index to local memory*/
    for(int i = 0; i < RADICES_T; ++i)
    {
 #if IS_GT
        localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
 #else
        localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
 #endif
    }
    /* Permute elements to appropriate location */
    for(int i = 0; i < RADICES_T; ++i)
    {
        int old_idx = mad24(globalId, RADICES_T, i);
        K_T  ovalue = unsortedKeys[old_idx];
        uint value = convertKey(as_uint(ovalue));
        uint maskedValue = (value >> shiftCount) & MASK_T;
        uint index = localIndex[maskedValue];
        sortedKeys[index] = ovalue;
        sortedVals[index] = unsortedVals[old_idx];
        localIndex[maskedValue] = index + 1;
    }
 }
--- a/modules/ocl/src/opencl/kernel_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl
@ -0,0 +1,245 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #ifndef K_T
 #define K_T float
 #endif
 #ifndef V_T
 #define V_T float
 #endif
 #ifndef IS_GT
 #define IS_GT false
 #endif
 #if IS_GT
 #define my_comp(x,y) ((x) > (y))
 #else
 #define my_comp(x,y) ((x) < (y))
 #endif
 /////////////////////// Bitonic sort ////////////////////////////
 // ported from 
 // https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
 __kernel
    void bitonicSort
    (
        __global K_T * keys,
        __global V_T * vals,
        int count,
        int stage,
        int passOfStage
    )
 {
    const int threadId = get_global_id(0);
    if(threadId >= count / 2)
    {
        return;
    }
    const int pairDistance = 1 << (stage - passOfStage);
    const int blockWidth   = 2 * pairDistance;
    int leftId = min( (threadId % pairDistance) 
                   + (threadId / pairDistance) * blockWidth, count );
    int rightId = min( leftId + pairDistance, count );
    int temp;
    const V_T lval = vals[leftId];
    const V_T rval = vals[rightId]; 
    const K_T lkey = keys[leftId];
    const K_T rkey = keys[rightId];
    int sameDirectionBlockWidth = 1 << stage;
    if((threadId/sameDirectionBlockWidth) % 2 == 1)
    {
        temp = rightId;
        rightId = leftId;
        leftId = temp;
    }
    const bool compareResult = my_comp(lkey, rkey);
    if(compareResult)
    {
        keys[rightId] = rkey;
        keys[leftId]  = lkey;
        vals[rightId] = rval;
        vals[leftId]  = lval;
    }
    else
    {
        keys[rightId] = lkey;
        keys[leftId]  = rkey;
        vals[rightId] = lval;
        vals[leftId]  = rval;
    }
 }
 /////////////////////// Selection sort ////////////////////////////
 //kernel is ported from Bolt library:
 //https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
 __kernel
    void selectionSortLocal
    (
        __global K_T * keys,
        __global V_T * vals,
        const int count,
        __local  K_T * scratch
    )
 {
    int          i  = get_local_id(0); // index in workgroup
    int numOfGroups = get_num_groups(0); // index in workgroup
    int groupID     = get_group_id(0);
    int         wg  = get_local_size(0); // workgroup size = block size
    int n; // number of elements to be processed for this work group
    int offset   = groupID * wg;
    int same     = 0;
    vals      += offset;
    keys      += offset;
    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
    int clamped_i= min(i, n - 1);
    K_T key1 = keys[clamped_i], key2;
    V_T val1 = vals[clamped_i];
    scratch[i] = key1;
    barrier(CLK_LOCAL_MEM_FENCE);
    if(i >= n)
    {
        return;
    }
    int pos = 0;
    for (int j=0;j<n;++j)
    {
        key2  = scratch[j];
        if(my_comp(key2, key1)) 
            pos++;//calculate the rank of this element in this work group
        else 
        {
            if(my_comp(key1, key2))
                continue;
            else 
            {
                // key1 and key2 are same
                same++;
            }
        }
    }
    for (int j=0; j< same; j++)
    {
        vals[pos + j] = val1;
        keys[pos + j] = key1;
    }
 }
 __kernel
    void selectionSortFinal
    (
        __global K_T * keys,
        __global V_T * vals,
        const int count
    )
 {
    const int          i  = get_local_id(0); // index in workgroup
    const int numOfGroups = get_num_groups(0); // index in workgroup
    const int groupID     = get_group_id(0);
    const int         wg  = get_local_size(0); // workgroup size = block size
    int pos = 0, same = 0;
    const int offset = get_group_id(0) * wg;
    const int remainder = count - wg*(numOfGroups-1);
    if((offset + i ) >= count)
        return;
    V_T val1 = vals[offset + i];
    K_T key1 = keys[offset + i];
    K_T key2;
    for(int j=0; j<numOfGroups-1; j++ )
    {
        for(int k=0; k<wg; k++)
        {
            key2 = keys[j*wg + k]; 
            if(my_comp(key1, key2))
                break;
            else
            {
                //Increment only if the value is not the same. 
                if(my_comp(key2, key1))
                    pos++;
                else 
                    same++;
            }
        }
    }
    for(int k=0; k<remainder; k++)
    {
        key2 = keys[(numOfGroups-1)*wg + k]; 
        if(my_comp(key1, key2))
            break;
        else
        {
            //Don't increment if the value is the same. 
            if(my_comp(key2, key1))
                pos++;
            else 
                same++;
        }
    }  
    for (int j=0; j< same; j++)
    {
        vals[pos + j] = val1;
        keys[pos + j] = key1;
    }
 }
--- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
@ -0,0 +1,296 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #ifndef K_T
 #define K_T float
 #endif
 #ifndef V_T
 #define V_T float
 #endif
 #ifndef IS_GT
 #define IS_GT false
 #endif
 #if IS_GT
 #define my_comp(x,y) ((x) > (y))
 #else
 #define my_comp(x,y) ((x) < (y))
 #endif
 ///////////// parallel merge sort ///////////////
 // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
 uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
 {
    //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
    uint firstIndex = left;
    uint lastIndex = right;
    //  This loops through [firstIndex, lastIndex)
    //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
    //  this while loop will be divergent within a wavefront
    while( firstIndex < lastIndex )
    {
        K_T dataVal = data[ firstIndex ];
        //  This branch will create divergent wavefronts
        if( my_comp( dataVal, searchVal ) )
        {
            firstIndex = firstIndex+1;
        }
        else
        {
            break;
        }
    }
    return firstIndex;
 }
 //  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
 //  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
 //  passed as a functor parameter my_comp
 //  This function returns an index that is the first index whos value would be equal to the searched value
 uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
 {
    //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
    uint firstIndex = left;
    uint lastIndex = right;
    //  This loops through [firstIndex, lastIndex)
    //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
    //  this while loop will be divergent within a wavefront
    while( firstIndex < lastIndex )
    {
        //  midIndex is the average of first and last, rounded down
        uint midIndex = ( firstIndex + lastIndex ) / 2;
        K_T midValue = data[ midIndex ];
        //  This branch will create divergent wavefronts
        if( my_comp( midValue, searchVal ) )
        {
            firstIndex = midIndex+1;
            // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
        }
        else
        {
            lastIndex = midIndex;
            // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
        }
    }
    return firstIndex;
 }
 //  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
 //  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
 //  passed as a functor parameter my_comp
 //  This function returns an index that is the first index whos value would be greater than the searched value
 //  If the search value is not found in the sequence, upperbound returns the same result as lowerbound
 uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
 {
    uint upperBound = lowerBoundBinary( data, left, right, searchVal );
    // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
    //  If upperBound == right, then  searchVal was not found in the sequence.  Just return.
    if( upperBound != right )
    {
        //  While the values are equal i.e. !(x < y) && !(y < x) increment the index
        K_T upperValue = data[ upperBound ];
        while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
        {
            upperBound++;
            upperValue = data[ upperBound ];
        }
    }
    return upperBound;
 }
 //  This kernel implements merging of blocks of sorted data.  The input to this kernel most likely is
 //  the output of blockInsertionSortTemplate.  It is expected that the source array contains multiple
 //  blocks, each block is independently sorted.  The goal is to write into the output buffer half as
 //  many blocks, of double the size.  The even and odd blocks are stably merged together to form
 //  a new sorted block of twice the size.  The algorithm is out-of-place.
 kernel void merge(
    global K_T*   iKey_ptr,
    global V_T*   iValue_ptr,
    global K_T*   oKey_ptr,
    global V_T*   oValue_ptr,
    const uint    srcVecSize,
    const uint    srcLogicalBlockSize,
    local K_T*    key_lds,
    local V_T*    val_lds
 )
 {
    size_t globalID     = get_global_id( 0 );
    size_t groupID      = get_group_id( 0 );
    size_t localID      = get_local_id( 0 );
    size_t wgSize       = get_local_size( 0 );
    //  Abort threads that are passed the end of the input vector
    if( globalID >= srcVecSize )
        return; // on SI this doesn't mess-up barriers
    //  For an element in sequence A, find the lowerbound index for it in sequence B
    uint srcBlockNum   = globalID / srcLogicalBlockSize;
    uint srcBlockIndex = globalID % srcLogicalBlockSize;
    // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
    //  Pairs of even-odd blocks will be merged together
    //  An even block should search for an insertion point in the next odd block,
    //  and the odd block should look for an insertion point in the corresponding previous even block
    uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
    uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
    leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
    leftBlockIndex = min( leftBlockIndex, srcVecSize );
    uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
    // if( localID == 0 )
    // {
    // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
    // }
    //  For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
    // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
    uint insertionIndex = 0;
    if( (srcBlockNum & 0x1) == 0 )
    {
        insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
    }
    else
    {
        insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
    }
    //  The index of an element in the result sequence is the summation of it's indixes in the two input
    //  sequences
    uint dstBlockIndex = srcBlockIndex + insertionIndex;
    uint dstBlockNum = srcBlockNum/2;
    // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
    // {
    // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
    // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
    // }
    oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
    oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
    // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
 }
 kernel void blockInsertionSort(
    global K_T*   key_ptr,
    global V_T*   value_ptr,
    const uint    vecSize,
    local K_T*    key_lds,
    local V_T*    val_lds
 )
 {
    size_t gloId    = get_global_id( 0 );
    size_t groId    = get_group_id( 0 );
    size_t locId    = get_local_id( 0 );
    size_t wgSize   = get_local_size( 0 );
    bool in_range = gloId < vecSize;
    K_T key;
    V_T val;
    //  Abort threads that are passed the end of the input vector
    if (in_range)
    {
        //  Make a copy of the entire input array into fast local memory
        key = key_ptr[ gloId ];
        val = value_ptr[ gloId ];
        key_lds[ locId ] = key;
        val_lds[ locId ] = val;
    }
    barrier( CLK_LOCAL_MEM_FENCE );
    //  Sorts a workgroup using a naive insertion sort
    //  The sort uses one thread within a workgroup to sort the entire workgroup
    if( locId == 0 && in_range )
    {
        //  The last workgroup may have an irregular size, so we calculate a per-block endIndex
        //  endIndex is essentially emulating a mod operator with subtraction and multiply
        size_t endIndex = vecSize - ( groId * wgSize );
        endIndex = min( endIndex, wgSize );
        // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
        //  Indices are signed because the while loop will generate a -1 index inside of the max function
        for( int currIndex = 1; currIndex < endIndex; ++currIndex )
        {
            key = key_lds[ currIndex ];
            val = val_lds[ currIndex ];
            int scanIndex = currIndex;
            K_T ldsKey = key_lds[scanIndex - 1];
            while( scanIndex > 0 && my_comp( key, ldsKey ) )
            {
                V_T ldsVal = val_lds[scanIndex - 1];
                //  If the keys are being swapped, make sure the values are swapped identicaly
                key_lds[ scanIndex ] = ldsKey;
                val_lds[ scanIndex ] = ldsVal;
                scanIndex = scanIndex - 1;
                ldsKey = key_lds[ max( 0, scanIndex - 1 ) ];  // scanIndex-1 may be -1
            }
            key_lds[ scanIndex ] = key;
            val_lds[ scanIndex ] = val;
        }
    }
    barrier( CLK_LOCAL_MEM_FENCE );
    if(in_range)
    {
        key = key_lds[ locId ];
        key_ptr[ gloId ] = key;
        val = val_lds[ locId ];
        value_ptr[ gloId ] = val;
    }
 }
 ///////////// Radix sort from b40c library /////////////
--- a/modules/ocl/src/sort_by_key.cpp
+++ b/modules/ocl/src/sort_by_key.cpp
@ -0,0 +1,454 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <iomanip>
 #include "precomp.hpp"
 namespace cv
 {
 namespace ocl
 {
 extern const char * kernel_sort_by_key;
 extern const char * kernel_stablesort_by_key;
 extern const char * kernel_radix_sort_by_key;
 void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
 //TODO(pengx17): change this value depending on device other than a constant
 const static unsigned int GROUP_SIZE = 256;
 const char * depth_strings[] =
 {
    "uchar",   //CV_8U
    "char",    //CV_8S
    "ushort",  //CV_16U
    "short",   //CV_16S
    "int",     //CV_32S
    "float",   //CV_32F
    "double"   //CV_64F
 };
 void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
 {
    sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
            isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
    if(vals.oclchannels() > 1)
    {
        sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
    }
 }
 inline bool isSizePowerOf2(size_t size)
 {
    return ((size - 1) & (size)) == 0;
 }
 namespace bitonic_sort
 {
 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
 {
    CV_Assert(isSizePowerOf2(vecSize));
    Context * cxt = Context::getContext();
    size_t globalThreads[3] = {vecSize / 2, 1, 1};
    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
    // 2^numStages should be equal to vecSize or the output is invalid
    int numStages = 0;
    for(int i = vecSize; i > 1; i >>= 1)
    {
        ++numStages;
    }
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
    const int argc = 5;
    std::vector< std::pair<size_t, const void *> > args(argc);
    String kernelname = "bitonicSort";
    args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
    args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
    args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
    for(int stage = 0; stage < numStages; ++stage)
    {
        args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
        {
            args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
        }
    }
 }
 }  /* bitonic_sort */
 namespace selection_sort
 {
 // FIXME:
 // This function cannot sort arrays with duplicated keys
 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
 {
    CV_Error(-1, "This function is incorrect at the moment.");
    Context * cxt = Context::getContext();
    size_t globalThreads[3] = {vecSize, 1, 1};
    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
    std::vector< std::pair<size_t, const void *> > args;
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
    //local
    String kernelname = "selectionSortLocal";
    int lds_size = GROUP_SIZE * keys.elemSize();
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
    args.push_back(std::make_pair(lds_size,       (void*)NULL));
    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
    //final
    kernelname = "selectionSortFinal";
    args.pop_back();
    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
 }
 }  /* selection_sort */
 namespace radix_sort
 {
 //FIXME(pengx17): 
 // exclusive scan, need to be optimized as this is too naive...
 //void naive_scan_addition(oclMat& input, oclMat& output)
 //{
 //    Context * cxt = Context::getContext();
 //    size_t vecSize = input.cols;
 //    size_t globalThreads[3] = {1, 1, 1};
 //    size_t localThreads[3]  = {1, 1, 1};
 //
 //    String kernelname = "naiveScanAddition";
 //
 //    std::vector< std::pair<size_t, const void *> > args;
 //    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
 //    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
 //    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
 //    openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
 //}
 void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
 {
    Mat m_input = input, m_output(output.size(), output.type());
    MatIterator_<int> i_mit = m_input.begin<int>();
    MatIterator_<int> o_mit = m_output.begin<int>();
    *o_mit = 0;
    ++i_mit;
    ++o_mit;
    for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
    {
        *o_mit = *(o_mit - 1) + *(i_mit - 1);
    }
    output = m_output;
 }
 //radix sort ported from Bolt
 static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
 {
    CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
    bool isKeyFloat = keys.type() == CV_32F;
    const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
                         //local array of 16 elements in the histogram kernel.
    const int RADICES = (1 << RADIX); //Values handeled by each work-item?
    bool  newBuffer = false;
    size_t vecSize = origVecSize;
    unsigned int groupSize  = RADICES;
    size_t mulFactor = groupSize * RADICES;
    oclMat buffer_keys, buffer_vals;
    if(origVecSize % mulFactor != 0)
    {
        vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
        buffer_keys.create(1, vecSize, keys.type());
        buffer_vals.create(1, vecSize, vals.type());
        Scalar padding_value;
        oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
        if(isGreaterThan)
        {
            switch(buffer_keys.depth())
            {
            case CV_32F:
                padding_value = Scalar::all(-FLT_MAX);
                break;
            case CV_32S:
                padding_value = Scalar::all(INT_MIN);
                break;
            }
        }
        else
        {
            switch(buffer_keys.depth())
            {
            case CV_32F:
                padding_value = Scalar::all(FLT_MAX);
                break;
            case CV_32S:
                padding_value = Scalar::all(INT_MAX);
                break;
            }
        }
        ocl::copyMakeBorder(
            keys(Rect(0,0,origVecSize,1)), buffer_keys, 
            0, 0, 0, vecSize - origVecSize, 
            BORDER_CONSTANT, padding_value);
        vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
        newBuffer = true;
    }
    else
    {
        buffer_keys = keys;
        buffer_vals = vals;
        newBuffer = false;
    }
    oclMat swap_input_keys(1, vecSize, keys.type());
    oclMat swap_input_vals(1, vecSize, vals.type());
    oclMat hist_bin_keys(1, vecSize, CV_32SC1);
    oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
    Context * cxt = Context::getContext();
    size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
    size_t localThreads[3]  = {groupSize, 1, 1};
    std::vector< std::pair<size_t, const void *> > args;
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
    //additional build option for radix sort
    sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT"); 
    String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
    int swap = 0;
    for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
    {
        args.clear();
        //Do a histogram pass locally
        if(swap == 0)
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
        }
        else
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
        }
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
            args, -1, -1, build_opt_buf);
        args.clear();
        //Perform a global scan
        naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
        // end of scan
        if(swap == 0)
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
        }
        else
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
        }
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
        if(swap == 0)
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
        }
        else
        {
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
        }
        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
            args, -1, -1, build_opt_buf);
        swap = swap ? 0 : 1;
    }
    if(newBuffer)
    {
        buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
        buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
    }
 }
 }  /* radix_sort */
 namespace merge_sort
 {
 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
 {
    Context * cxt = Context::getContext();
    size_t globalThreads[3] = {vecSize, 1, 1};
    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
    std::vector< std::pair<size_t, const void *> > args;
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
    String kernelname[] = {String("blockInsertionSort"), String("merge")};
    int keylds_size = GROUP_SIZE * keys.elemSize();
    int vallds_size = GROUP_SIZE * vals.elemSize();
    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&keys.data));
    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&vals.data));
    args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
    args.push_back(std::make_pair(keylds_size,     (void*)NULL));
    args.push_back(std::make_pair(vallds_size,     (void*)NULL));
    openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
    //  Early exit for the case of no merge passes, values are already in destination vector
    if(vecSize <= GROUP_SIZE)
    {
        return;
    }
    //  An odd number of elements requires an extra merge pass to sort
    size_t numMerges = 0;
    //  Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
    //  this is how many merge passes we want
    size_t log2BlockSize = vecSize >> 6;
    for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
    {
        ++numMerges;
    }
    //  Check to see if the input vector size is a power of 2, if not we will need last merge pass
    numMerges += isSizePowerOf2(vecSize)? 1: 0;
    //  Allocate a flipflop buffer because the merge passes are out of place
    oclMat tmpKeyBuffer(keys.size(), keys.type());
    oclMat tmpValBuffer(vals.size(), vals.type());
    args.resize(8);
    args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
    args[6] = std::make_pair(keylds_size,    (void*)NULL);
    args[7] = std::make_pair(vallds_size,    (void*)NULL);
    for(size_t pass = 1; pass <= numMerges; ++pass )
    {
        //  For each pass, flip the input-output buffers
        if( pass & 0x1 )
        {
            args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
            args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
            args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
            args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
        }
        else
        {
            args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
            args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
            args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
            args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
        }
        //  For each pass, the merge window doubles
        unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
        args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
        openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
    }
    //  If there are an odd number of merges, then the output data is sitting in the temp buffer.  We need to copy
    //  the results back into the input array
    if( numMerges & 1 )
    {
        tmpKeyBuffer.copyTo(keys);
        tmpValBuffer.copyTo(vals);
    }
 }
 }  /* merge_sort */
 }
 } /* namespace cv { namespace ocl */
 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
 {
    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
    CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
    CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
    switch(method)
    {
    case SORT_BITONIC:
        bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_SELECTION:
        selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_RADIX:
        radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_MERGE:
        merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    }
 }
 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
 {
    CV_Assert( keys.size() == vals.size() );
    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
    size_t vecSize = static_cast<size_t>(keys.cols);
    sortByKey(keys, vals, vecSize, method, isGreaterThan);
 }
--- a/modules/ocl/test/test_sort.cpp
+++ b/modules/ocl/test/test_sort.cpp
@ -0,0 +1,244 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <map>
 #include <functional>
 #include "precomp.hpp"
 using namespace std;
 using namespace cvtest;
 using namespace testing;
 using namespace cv;
 namespace
 {
 IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool)
 IMPLEMENT_PARAM_CLASS(InputSize, int)
 IMPLEMENT_PARAM_CLASS(SortMethod, int)
 template<class T> 
 struct KV_CVTYPE{ static int toType() {return 0;} };
 template<> struct KV_CVTYPE<int>  { static int toType() {return CV_32SC1;} };
 template<> struct KV_CVTYPE<float>{ static int toType() {return CV_32FC1;} };
 template<> struct KV_CVTYPE<Vec2i>{ static int toType() {return CV_32SC2;} };
 template<> struct KV_CVTYPE<Vec2f>{ static int toType() {return CV_32FC2;} };
 template<class key_type, class val_type>
 bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
 {
    return p1.first > p2.first;
 }
 template<class key_type, class val_type>
 bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
 {
    return p1.first < p2.first;
 }
 template<class key_type, class val_type>
 void toKVPair(
    MatConstIterator_<key_type> kit,
    MatConstIterator_<val_type> vit,
    int vecSize,
    vector<pair<key_type, val_type> >& kvres
    )
 {
    kvres.clear();
    for(int i = 0; i < vecSize; i ++)
    {
        kvres.push_back(make_pair(*kit, *vit));
        ++kit;
        ++vit;
    }
 }
 template<class key_type, class val_type>
 void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false)
 {
    vector<pair<key_type, val_type> > kvres;
    toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), keys.cols, kvres);
    if(isGreater)
    {
        std::sort(kvres.begin(), kvres.end(), kvgreater<key_type, val_type>);
    }
    else
    {
        std::sort(kvres.begin(), kvres.end(), kvless<key_type, val_type>);
    }
    key_type * kptr = keys.ptr<key_type>();
    val_type * vptr = vals.ptr<val_type>();
    for(int i = 0; i < keys.cols; i ++)
    {
        kptr[i] = kvres[i].first;
        vptr[i] = kvres[i].second;
    }
 }
 class SortByKey_STL
 {
 public:
    static void sort(cv::Mat&, cv::Mat&, bool is_gt);
 private:
    typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool);
    SortByKey_STL();
    quick_sorter quick_sorters[CV_64FC4][CV_64FC4];
    static SortByKey_STL instance;
 };
 SortByKey_STL SortByKey_STL::instance = SortByKey_STL();
 SortByKey_STL::SortByKey_STL()
 {
    memset(instance.quick_sorters, 0, sizeof(quick_sorters));
 #define NEW_SORTER(KT, VT) \
    instance.quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>;
    NEW_SORTER(int, int);
    NEW_SORTER(int, Vec2i);
    NEW_SORTER(int, float);
    NEW_SORTER(int, Vec2f);
    NEW_SORTER(float, int);
    NEW_SORTER(float, Vec2i);
    NEW_SORTER(float, float);
    NEW_SORTER(float, Vec2f);
 #undef NEW_SORTER
 }
 void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt)
 {
    instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt);
 }
 bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_,
                               const Mat& /*dkeys_*/, const Mat& dvals_)
 {
    int cn_val = gvals_.channels();
    int count  = gkeys_.cols;
    //for convenience we convert depth to float and channels to 1
    Mat gkeys, gvals, dkeys, dvals;
    gkeys_.reshape(1).convertTo(gkeys, CV_32F);
    gvals_.reshape(1).convertTo(gvals, CV_32F);
    //dkeys_.reshape(1).convertTo(dkeys, CV_32F);
    dvals_.reshape(1).convertTo(dvals, CV_32F);
    float * gkptr = gkeys.ptr<float>();
    float * gvptr = gvals.ptr<float>();
    //float * dkptr = dkeys.ptr<float>();
    float * dvptr = dvals.ptr<float>();
    for(int i = 0; i < count - 1; ++i)
    {
        int iden_count = 0;
        // firstly calculate the number of identical keys
        while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count])
        {
            ++ iden_count;
        }
        // sort dv and gv
        int num_of_val = (iden_count + 1) * cn_val;
        std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val);
        std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val);
        // then check if [i, i + iden_count) is the same
        for(int j = 0; j < num_of_val; ++j)
        {
            if(gvptr[i + j] != dvptr[i + j])
            {
                return false;
            }
        }
        i += iden_count;
    }
    return true;
 }
 }
 #define INPUT_SIZES  Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
 #define KEY_TYPES    Values(MatType(CV_32SC1), MatType(CV_32FC1))
 #define VAL_TYPES    Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2))
 #define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/)
 #define F_OR_T       Values(IsGreaterThan(false), IsGreaterThan(true))
 PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan)
 {
    InputSize input_size;
    MatType key_type, val_type;
    SortMethod method;
    IsGreaterThan is_gt;
    Mat mat_key, mat_val;
    virtual void SetUp()
    {
        input_size = GET_PARAM(0);
        key_type   = GET_PARAM(1);
        val_type   = GET_PARAM(2);
        method     = GET_PARAM(3);
        is_gt      = GET_PARAM(4);
        using namespace cv;
        // fill key and val
        mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX);
        mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX);
    }
 };
 TEST_P(SortByKey, Accuracy)
 {
    using namespace cv;
    ocl::oclMat oclmat_key(mat_key);
    ocl::oclMat oclmat_val(mat_val);
    ocl::sortByKey(oclmat_key, oclmat_val, method, is_gt);
    SortByKey_STL::sort(mat_key, mat_val, is_gt);
    EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0);
    EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val));
 }
 INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T));