more fix of mismatch functions on CPU OCL

12 years ago · ad6aae4583
parent 2c06e59a69
commit ad6aae4583
6 changed files with 231 additions and 798 deletions
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@ -44,6 +44,7 @@
 //M*/

 #include "precomp.hpp"
+
 #include <functional>
 #include <iterator>
 #include <vector>
@ -60,10 +61,11 @@ namespace cv
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -91,20 +93,21 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat

        std::string kernelName = "BruteForceMatch_UnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
                         const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
           const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -130,21 +133,22 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,

        std::string kernelName = "BruteForceMatch_Match";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
           const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }

 //radius_matchUnrolledCached
-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -176,15 +180,16 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist

        std::string kernelName = "BruteForceMatch_RadiusUnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

 //radius_match
-template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                  const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -214,263 +219,70 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c

        std::string kernelName = "BruteForceMatch_RadiusMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
-        //float *dis = (float *)clEnqueueMapBuffer(ctx->impl->clCmdQueue, (cl_mem)distance.data, CL_TRUE, CL_MAP_READ, 0, 8, 0, NULL, NULL, NULL);
-        //printf("%f, %f\n", dis[0], dis[1]);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-// with mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx,  distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, train, mask, trainIdx, distance, distType);
-    }
-}
-
-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
+        matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx,  distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
+        matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx,  distance, distType);
    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
-    }*/
    else
    {
-        match<16, T>(query, train, mask, trainIdx, distance, distType);
+        match<16>(query, train, tempMask, trainIdx, distance, distType);
    }
 }

-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
+        matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }*/
    else
    {
-        match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &trainIdx,
-                     const oclMat &imgIdx, const oclMat &distance, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
+        match<16>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
 }

 //radius matchDispatcher
-// with mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
+        matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+        matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &trainIdx,
-                     const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }*/
    else
    {
-        radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &trainIdx,
-                     const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
+        radius_match<16>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
 }

 //knn match Dispatcher
-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                             const oclMat &trainIdx, const oclMat &distance, int distType)
 {
@ -501,11 +313,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl

        std::string kernelName = "BruteForceMatch_knnUnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
               const oclMat &trainIdx, const oclMat &distance, int distType)
 {
@ -534,11 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,

        std::string kernelName = "BruteForceMatch_knnMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@ -567,11 +379,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat

        std::string kernelName = "BruteForceMatch_calcDistanceUnrolled";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@ -598,69 +410,43 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask

        std::string kernelName = "BruteForceMatch_calcDistance";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

 ///////////////////////////////////////////////////////////////////////////////
 // Calc Distance dispatcher
-template < typename T/*, typename Mask*/ >
-void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                            const oclMat &allDist, int distType)
 {
    if (query.cols <= 64)
    {
-        calcDistanceUnrolled<16, 64, T>(query, train, mask, allDist, distType);
+        calcDistanceUnrolled<16, 64>(query, train, mask, allDist, distType);
    }
    else if (query.cols <= 128)
    {
-        calcDistanceUnrolled<16, 128, T>(query, train, mask, allDist, distType);
+        calcDistanceUnrolled<16, 128>(query, train, mask, allDist, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
-    }*/
    else
    {
-        calcDistance<16, T>(query, train, mask, allDist, distType);
+        calcDistance<16>(query, train, mask, allDist, distType);
    }
 }

-template < typename T/*, typename Mask*/ >
-void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    if (query.cols <= 64)
    {
-        knn_matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
+        knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        knn_matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
+        knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
-    }*/
    else
    {
-        knn_match<16, T>(query, train, mask, trainIdx, distance, distType);
+        knn_match<16>(query, train, mask, trainIdx, distance, distType);
    }
 }

@ -686,7 +472,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
        //args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols ));
        //args.push_back( make_pair( sizeof(cl_int), (void *)&query.step ));

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, trainIdx.depth(), -1);
    }
 }

@ -695,206 +481,22 @@ static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &
    findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
 }

-//with mask
-template < typename T/*, typename Mask*/ >
-void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
+static void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (k == 2)
    {
-        match2Dispatcher<T>(query, train, mask, trainIdx, distance, distType);
+        match2Dispatcher(query, train, tempMask, trainIdx, distance, distType);
    }
    else
    {
-        calcDistanceDispatcher<T>(query, train, mask, allDist, distType);
+        calcDistanceDispatcher(query, train, tempMask, allDist, distType);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType);
    }
 }

-//without mask
-template < typename T/*, typename Mask*/ >
-void kmatchDispatcher(const oclMat &query, const oclMat &train, int k,
-                      const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
-{
-    oclMat mask;
-    if (k == 2)
-    {
-        match2Dispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        calcDistanceDispatcher<T>(query, train, mask, allDist, distType);
-        findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType);
-    }
-}
-
-
-
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 0;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher< T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 0;
-
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 1;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 1;
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 2;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher< T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                          const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 2;
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-// knn caller
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 0;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k, trainIdx, distance, allDist, distType);
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 1;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k, trainIdx, distance, allDist, distType);
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 2;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k,  trainIdx, distance, allDist, distType);
-}
-
-//radius caller
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 0;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 1;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 2;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance,  nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
 cv::ocl::BruteForceMatcher_OCL_base::BruteForceMatcher_OCL_base(DistType distType_) : distType(distType_)
 {
 }
@ -929,38 +531,28 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const
 {
    if (query.empty() || train.empty())
        return;
+    
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "singleMatch";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance);
-
-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.cols == query.cols && train.type() == query.type());

-    const int nQuery = query.rows;
-    trainIdx.create(1, nQuery, CV_32S);
-    distance.create(1, nQuery, CV_32F);
+    trainIdx.create(1, query.rows, CV_32S);
+    distance.create(1, query.rows, CV_32F);

-    caller_t func = callers[distType][query.depth()];
-    func(query, train, mask, trainIdx, distance);
+    matchDispatcher(query, train, mask, trainIdx, distance, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, vector<DMatch> &matches)
@ -1062,40 +654,27 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
    if (query.empty() || trainCollection.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
-                             const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "matchCollection";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-            0/*matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

-    const int nQuery = query.rows;
-
-    trainIdx.create(1, nQuery, CV_32S);
-    imgIdx.create(1, nQuery, CV_32S);
-    distance.create(1, nQuery, CV_32F);
-
-    caller_t func = callers[distType][query.depth()];
-    CV_Assert(func != 0);
+    trainIdx.create(1, query.rows, CV_32S);
+    imgIdx.create(1, query.rows, CV_32S);
+    distance.create(1, query.rows, CV_32F);

-    func(query, trainCollection, masks, trainIdx, imgIdx, distance);
+    matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, vector<DMatch> &matches)
@ -1164,52 +743,39 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co
    if (query.empty() || train.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, int k, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance, const oclMat & allDist);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();

-    static const caller_t callers[3][6] =
+    char cvFuncName[] = "knnMatchSingle";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);

-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
    if (k == 2)
    {
-        trainIdx.create(1, nQuery, CV_32SC2);
-        distance.create(1, nQuery, CV_32FC2);
+        trainIdx.create(1, query.rows, CV_32SC2);
+        distance.create(1, query.rows, CV_32FC2);
    }
    else
    {
-        trainIdx.create(nQuery, k, CV_32S);
-        distance.create(nQuery, k, CV_32F);
-        allDist.create(nQuery, nTrain, CV_32FC1);
+        trainIdx.create(query.rows, k, CV_32S);
+        distance.create(query.rows, k, CV_32F);
+        allDist.create(query.rows, train.rows, CV_32FC1);
    }

    trainIdx.setTo(Scalar::all(-1));

-    caller_t func = callers[distType][query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, train, k, mask, trainIdx, distance, allDist);
+    kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, vector< vector<DMatch> > &matches, bool compactResult)
@ -1394,8 +960,6 @@ namespace
 void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector< vector<DMatch> > &matches, int k,
        const vector<oclMat> &masks, bool compactResult)
 {
-
-
    if (k == 2)
    {
        oclMat trainCollection;
@ -1455,50 +1019,34 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query,
    if (query.empty() || train.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, float maxDistance, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance, const oclMat & nMatches);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "radiusMatchSingle";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    //#if 0
-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
-    //#endif
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size()));

-    nMatches.create(1, nQuery, CV_32SC1);
+    nMatches.create(1, query.rows, CV_32SC1);
    if (trainIdx.empty())
    {
-        trainIdx.create(nQuery, std::max((nTrain / 100), 10), CV_32SC1);
-        distance.create(nQuery, std::max((nTrain / 100), 10), CV_32FC1);
+        trainIdx.create(query.rows, std::max((train.rows/ 100), 10), CV_32SC1);
+        distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1);
    }

    nMatches.setTo(Scalar::all(0));

-    caller_t func = callers[distType][query.depth()];
-    //CV_Assert(func != 0);
-    //func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
-    func(query, train, maxDistance, mask, trainIdx, distance, nMatches);
+    matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
@ -1697,5 +1245,3 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
 }
-
-
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        //int flag = 0;

        oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
-        oclMat gsum(totalheight, gimg.cols + 1, CV_32SC1);
-        oclMat gsqsum(totalheight, gimg.cols + 1, CV_32FC1);
+        oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
+        oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);

        //cl_mem cascadebuffer;
        cl_mem stagebuffer;
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )

        bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;

-        if (!cv::ocl::Context::getContext()->impl->double_support && is_float)
+        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
        {
            CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
        }
@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
        
        cv::Mat dst(dst_a);
        a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
-        if (!cv::ocl::Context::getContext()->impl->double_support)
+        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
        {
            for (int i = 0; i < contour->total; ++i)
            {
@ -161,7 +161,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
                a12 += dst.at<cl_long>(8, i);
                a03 += dst.at<cl_long>(9, i);
            }
-        } 
+        }
        else
        {
            a00 = cv::sum(dst.row(0))[0];
--- a/modules/ocl/src/opencl/brute_force_match.cl
+++ b/modules/ocl/src/opencl/brute_force_match.cl
@ -5,19 +5,93 @@ int bit1Count(float x)
 {
    int c = 0;
    int ix = (int)x;
-
    for (int i = 0 ; i < 32 ; i++)
    {
        c += ix & 0x1;
        ix >>= 1;
    }
-
    return (float)c;
 }
+
+float reduce_block(__local float *s_query,
+                   __local float *s_train,
+                   int block_size,
+                   int lidx,
+                   int lidy,
+                   int distType
+                  )
+{
+    /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+    sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+    float result = 0;
+    switch(distType)
+    {
+    case 0:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+        }
+        break;
+    case 1:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+            result += qr * qr;
+        }
+        break;
+    case 2:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
+        }
+        break;
+    }
+    return result;
+}
+
+float reduce_multi_block(__local float *s_query,
+                         __local float *s_train,
+                         int max_desc_len,
+                         int block_size,
+                         int block_index,
+                         int lidx,
+                         int lidy,
+                         int distType
+                        )
+{
+    /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+    sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+    float result = 0;
+    switch(distType)
+    {
+    case 0:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] -  s_train[j * block_size + lidx]);
+        }
+        break;
+    case 1:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            float qr = s_query[lidy * max_desc_len + block_index * block_size + j] -  s_train[j * block_size + lidx];
+            result += qr * qr;
+        }
+        break;
+    case 2:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+            result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+        }
+        break;
+    }
+    return result;
+}
+
 /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
 local size: dim0 is block_size, dim1 is block_size.
 */
-__kernel void BruteForceMatch_UnrollMatch(
+__kernel void BruteForceMatch_UnrollMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch(
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    int queryIdx = groupidx * block_size + lidy;
-
    // load the query into local memory.
    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
    {
@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch(

    // loopUnrolledCached to find the best trainIdx and best distance.
    volatile int imgIdx = 0;
-
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
-
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            //load a block_size * block_size block into local train.
@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch(
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch(
    }

    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float *)(sharebuffer);
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local float *s_distance = (__local float*)(sharebuffer);
+    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    //find BestMatch
    s_distance += lidy * block_size;
@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch(
    }
 }

-__kernel void BruteForceMatch_Match(
+__kernel void BruteForceMatch_Match_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match(
    {
        //Dist dist;
        float result = 0;
-
        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match(

            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match(
 }

 //radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
+__kernel void BruteForceMatch_RadiusUnrollMatch_D5(
    __global float *query,
    __global float *train,
    float maxDistance,
@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
-
    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch (distType)
-        {
-            case 0:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                }
-
-                break;
-            case 1:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-
-                break;
-            case 2:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                }
-
-                break;
-        }
+        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

        barrier(CLK_LOCAL_MEM_FENCE);
    }
@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    {
        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);

-        if (ind < bestTrainIdx_cols)
+        if(ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
 }

 //radius_match
-__kernel void BruteForceMatch_RadiusMatch(
+__kernel void BruteForceMatch_RadiusMatch_D5(
    __global float *query,
    __global float *train,
    float maxDistance,
@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
-
    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch(
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch (distType)
-        {
-            case 0:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                }
-
-                break;
-            case 1:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-
-                break;
-            case 2:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                }
-
-                break;
-        }
+        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
-        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+        unsigned int ind = atom_inc(nMatches + queryIdx);

-        if (ind < bestTrainIdx_cols)
+        if(ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch(
 }


-__kernel void BruteForceMatch_knnUnrollMatch(
+__kernel void BruteForceMatch_knnUnrollMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch(

    //loopUnrolledCached
    volatile int imgIdx = 0;
-
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
-
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            const int loadX = lidx + i * block_size;
@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
-
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    }
 }

-__kernel void BruteForceMatch_knnMatch(
+__kernel void BruteForceMatch_knnMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch(
    for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0.0f;
-
-        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+        for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
            //load query and train into local memory
@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch(

            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
-
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch(
    }
 }

-kernel void BruteForceMatch_calcDistanceUnrolled(
+kernel void BruteForceMatch_calcDistanceUnrolled_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
    /* Todo */
 }

-kernel void BruteForceMatch_calcDistance(
+kernel void BruteForceMatch_calcDistance_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance(
    /* Todo */
 }

-kernel void BruteForceMatch_findBestMatch(
+kernel void BruteForceMatch_findBestMatch_D5(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                int4 data = *(__global int4*)&sum[glb_off];
                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);

+#if OFF
                lcldata[lcl_off] = data.x;
                lcldata[lcl_off+1] = data.y;
                lcldata[lcl_off+2] = data.z;
                lcldata[lcl_off+3] = data.w;
+#else
+                vstore4(data, 0, &lcldata[lcl_off]);
+#endif
            }

            lcloutindex[lcl_id] = 0;
@ -559,3 +563,7 @@ if(result)
 }
 }
 */
+
+
+
+
--- a/modules/ocl/test/test_brute_force_matcher.cpp
+++ b/modules/ocl/test/test_brute_force_matcher.cpp
@ -110,7 +110,7 @@ namespace
        }
    };

-    TEST_P(BruteForceMatcher, DISABLED_Match_Single)
+    TEST_P(BruteForceMatcher, Match_Single)
    {
        cv::ocl::BruteForceMatcher_OCL_base matcher(distType);

@ -130,7 +130,7 @@ namespace
        ASSERT_EQ(0, badCount);
    }

-    TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
+    TEST_P(BruteForceMatcher, KnnMatch_2_Single)
    {
        const int knn = 2;