Merge pull request #3256 from vbystricky:oclopt_BFMatcher

11 years ago · 322593e89f
parent 4de9605b7b 3787388eac
commit 322593e89f
2 changed files with 278 additions and 705 deletions
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
@ -60,113 +60,58 @@ static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
        m.create(rows, cols, type);
 }
-
+static bool ocl_matchSingle(InputArray query, InputArray train,
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
+        UMat &trainIdx, UMat &distance, int distType)
 static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train,
                     const UMat &trainIdx, const UMat &distance, int distType)
 {
-    int depth = _query.depth();
+    if (query.empty() || train.empty())
    cv::String opts;
    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
    ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    const int query_rows = query.rows();
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const int query_cols = query.cols();
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    if(globalSize[0] != 0)
    {
        UMat query = _query.getUMat(), train = _train.getUMat();
        int idx = 0;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, (void *)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, (int)query.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
-template < int BLOCK_SIZE >
+    ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
-static bool ocl_match(InputArray _query, InputArray _train,
+    ensureSizeIsEnough(1, query_rows, CV_32F, distance);
                     const UMat &trainIdx, const UMat &distance, int distType)
 {
    int depth = _query.depth();
    cv::String opts;
    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
    ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    ocl::Device devDef = ocl::Device::getDefault();
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    if(globalSize[0] != 0)
+    UMat uquery = query.getUMat(), utrain = train.getUMat();
-    {
+    int kercn = 1;
-        UMat query = _query.getUMat(), train = _train.getUMat();
+    if (devDef.isIntel() &&
-
+        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
-        int idx = 0;
+        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        kercn = 4;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, (void *)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, (int)query.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
-static bool ocl_matchDispatcher(InputArray query, InputArray train,
+    int block_size = 16;
-                     const UMat &trainIdx, const UMat &distance, int distType)
+    int max_desc_len = 0;
-{
+    bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
    int query_cols = query.size().width;
    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
    if (query_cols <= 64)
-    {
+        max_desc_len = 64 / kercn;
        if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false;
    }
    else if (query_cols <= 128 && !is_cpu)
-    {
+        max_desc_len = 128 / kercn;
        if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx,  distance, distType)) return false;
    }
    else
    {
        if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false;
    }
    return true;
 }
-static bool ocl_matchSingle(InputArray query, InputArray train,
+    int depth = query.depth();
-        UMat &trainIdx, UMat &distance, int dstType)
+    cv::String opts;
-{
+    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-    if (query.empty() || train.empty())
+        ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
    ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    int query_rows = query.size().height;
+    size_t globalSize[] = {(query.size().height + block_size - 1) / block_size * block_size, block_size};
-
+    size_t localSize[] = {block_size, block_size};
-    ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
+
-    ensureSizeIsEnough(1, query_rows, CV_32F, distance);
+    int idx = 0;
-
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
-    return ocl_matchDispatcher(query, train, trainIdx, distance, dstType);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
    idx = k.set(idx, uquery.rows);
    idx = k.set(idx, uquery.cols);
    idx = k.set(idx, utrain.rows);
    idx = k.set(idx, utrain.cols);
    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
    return k.run(2, globalSize, localSize, false);
 }
 static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches)
@ -213,121 +158,60 @@ static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::v
    return ocl_matchConvert(trainIdxCPU, distanceCPU, matches);
 }
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
+static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
-static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train,
+                               UMat &distance, int distType)
                             const UMat &trainIdx, const UMat &distance, int distType)
 {
-    int depth = _query.depth();
+    if (query.empty() || train.empty())
    cv::String opts;
    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
    ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    const int query_rows = query.rows();
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const int query_cols = query.cols();
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    if(globalSize[0] != 0)
    {
        UMat query = _query.getUMat(), train = _train.getUMat();
        int idx = 0;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, (void *)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, (int)query.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
-template < int BLOCK_SIZE >
+    ensureSizeIsEnough(1, query_rows, CV_32SC2, trainIdx);
-static bool ocl_knn_match(InputArray _query, InputArray _train,
+    ensureSizeIsEnough(1, query_rows, CV_32FC2, distance);
               const UMat &trainIdx, const UMat &distance, int distType)
 {
    int depth = _query.depth();
    cv::String opts;
    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
    ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    trainIdx.setTo(Scalar::all(-1));
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    if(globalSize[0] != 0)
+    ocl::Device devDef = ocl::Device::getDefault();
    {
        UMat query = _query.getUMat(), train = _train.getUMat();
        int idx = 0;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, (void*)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, (int)query.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
-static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType)
+    UMat uquery = query.getUMat(), utrain = train.getUMat();
-{
+    int kercn = 1;
-    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+    if (devDef.isIntel() &&
-    if (query.size().width <= 64)
+        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
-    {
+        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
-        if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType))
+        kercn = 4;
            return false;
    }
    else if (query.size().width <= 128 && !is_cpu)
    {
        if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType))
            return false;
    }
    else
    {
        if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType))
            return false;
    }
    return true;
 }
-static bool ocl_kmatchDispatcher(InputArray query, InputArray train, const UMat &trainIdx,
+    int block_size = 16;
-                                 const UMat &distance, int distType)
+    int max_desc_len = 0;
-{
+    bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
-    return ocl_match2Dispatcher(query, train, trainIdx, distance, distType);
+    if (query_cols <= 64)
-}
+        max_desc_len = 64 / kercn;
    else if (query_cols <= 128 && !is_cpu)
        max_desc_len = 128 / kercn;
-static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
+    int depth = query.depth();
-                               UMat &distance, int dstType)
+    cv::String opts;
-{
+    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-    if (query.empty() || train.empty())
+        ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
    ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
-    const int nQuery = query.size().height;
+    size_t globalSize[] = {(query_rows + block_size - 1) / block_size * block_size, block_size};
-
+    size_t localSize[] = {block_size, block_size};
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
+
-    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
+    int idx = 0;
-
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
-    trainIdx.setTo(Scalar::all(-1));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
-
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-    return ocl_kmatchDispatcher(query, train, trainIdx, distance, dstType);
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
    idx = k.set(idx, uquery.rows);
    idx = k.set(idx, uquery.cols);
    idx = k.set(idx, utrain.rows);
    idx = k.set(idx, utrain.cols);
    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
    return k.run(2, globalSize, localSize, false);
 }
 static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
@ -383,134 +267,64 @@ static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std
    Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
    Mat distanceCPU = distance.getMat(ACCESS_READ);
-    if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) )
+    return ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
        return true;
    return false;
 }
 template < int BLOCK_SIZE, int MAX_DESC_LEN >
 static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance,
                  const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
 {
    int depth = _query.depth();
    cv::String opts;
    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
    ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    if(globalSize[0] != 0)
    {
        UMat query = _query.getUMat(), train = _train.getUMat();
        int idx = 0;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, maxDistance);
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
        idx = k.set(idx, (void*)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, trainIdx.cols);
        idx = k.set(idx, (int)query.step);
        idx = k.set(idx, (int)trainIdx.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
 //radius_match
 template < int BLOCK_SIZE >
 static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance,
                  const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
 {
    int depth = _query.depth();
    cv::String opts;
    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
    ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if(k.empty())
        return false;
    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    if(globalSize[0] != 0)
    {
        UMat query = _query.getUMat(), train = _train.getUMat();
        int idx = 0;
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
        idx = k.set(idx, maxDistance);
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
        idx = k.set(idx, (void*)NULL, smemSize);
        idx = k.set(idx, query.rows);
        idx = k.set(idx, query.cols);
        idx = k.set(idx, train.rows);
        idx = k.set(idx, train.cols);
        idx = k.set(idx, trainIdx.cols);
        idx = k.set(idx, (int)query.step);
        idx = k.set(idx, (int)trainIdx.step);
        return k.run(2, globalSize, localSize, false);
    }
    return true;
 }
 static bool ocl_rmatchDispatcher(InputArray query, InputArray train,
        UMat &trainIdx,   UMat &distance, UMat &nMatches, float maxDistance, int distType)
 {
    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
    int query_cols = query.size().width;
    if (query_cols <= 64)
    {
        if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
    }
    else if (query_cols <= 128 && !is_cpu)
    {
        if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
    }
    else
    {
        if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
    }
    return true;
 }
 static bool ocl_radiusMatchSingle(InputArray query, InputArray train,
        UMat &trainIdx,   UMat &distance, UMat &nMatches, float maxDistance, int distType)
 {
    if (query.empty() || train.empty())
        return false;
-    const int nQuery = query.size().height;
+    const int query_rows = query.rows();
-    const int nTrain = train.size().height;
+    const int train_rows = train.rows();
-    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
+    ensureSizeIsEnough(1, query_rows, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
+        ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
+        ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32FC1, distance);
    }
    nMatches.setTo(Scalar::all(0));
-    return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType);
+    ocl::Device devDef = ocl::Device::getDefault();
    UMat uquery = query.getUMat(), utrain = train.getUMat();
    int kercn = 1;
    if (devDef.isIntel() &&
        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
        kercn = 4;
    int block_size = 16;
    int depth = query.depth();
    cv::String opts;
    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size);
    ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
    if (k.empty())
        return false;
    size_t globalSize[] = {(train_rows + block_size - 1) / block_size * block_size, (query_rows + block_size - 1) / block_size * block_size, 1};
    size_t localSize[] = {block_size, block_size, 1};
    int idx = 0;
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
    idx = k.set(idx, maxDistance);
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
    idx = k.set(idx, uquery.rows);
    idx = k.set(idx, uquery.cols);
    idx = k.set(idx, utrain.rows);
    idx = k.set(idx, utrain.cols);
    idx = k.set(idx, trainIdx.cols);
    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
    idx = k.set(idx, (int)(trainIdx.step / sizeof(int)));
    return k.run(2, globalSize, localSize, false);
 }
 static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches,
--- a/modules/features2d/src/opencl/brute_force_match.cl
+++ b/modules/features2d/src/opencl/brute_force_match.cl
@ -59,39 +59,71 @@
 #define MAX_DESC_LEN 64
 #endif
 #define BLOCK_SIZE_ODD          (BLOCK_SIZE + 1)
 #ifndef SHARED_MEM_SZ
 #  if (BLOCK_SIZE < MAX_DESC_LEN)
 #    define SHARED_MEM_SZ      (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
 #  else
 #    define SHARED_MEM_SZ      (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
 #  endif
 #endif
 #ifndef DIST_TYPE
 #define DIST_TYPE 2
 #endif
 // dirty fix for non-template support
-#if   (DIST_TYPE == 2) // L1Dist
+#if (DIST_TYPE == 2) // L1Dist
 #   ifdef T_FLOAT
 #       define DIST(x, y) fabs((x) - (y))
        typedef float value_type;
        typedef float result_type;
 #       if (8 == kercn)
            typedef float8 value_type;
 #           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
 #       elif (4 == kercn)
            typedef float4 value_type;
 #           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
 #       else
            typedef float value_type;
 #           define DIST(x, y) result += fabs((x) - (y))
 #       endif
 #   else
 #       define DIST(x, y) abs((x) - (y))
        typedef int value_type;
        typedef int result_type;
 #       if (8 == kercn)
            typedef int8 value_type;
 #           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
 #       elif (4 == kercn)
            typedef int4 value_type;
 #           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
 #       else
            typedef int  value_type;
 #           define DIST(x, y) result += abs((x) - (y))
 #       endif
 #   endif
-#define DIST_RES(x) (x)
+#   define DIST_RES(x) (x)
 #elif (DIST_TYPE == 4) // L2Dist
-#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
+    typedef float result_type;
-typedef float value_type;
+#   if (8 == kercn)
-typedef float result_type;
+        typedef float8 value_type;
-#define DIST_RES(x) sqrt(x)
+#       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
 #   elif (4 == kercn)
        typedef float4      value_type;
 #       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d, d);}
 #   else
        typedef float       value_type;
 #       define DIST(x, y)   {value_type d = ((x) - (y)); result = mad(d, d, result);}
 #   endif
 #   define DIST_RES(x) sqrt(x)
 #elif (DIST_TYPE == 6) // Hamming
-//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+#   if (8 == kercn)
-inline int bit1Count(int v)
+        typedef int8 value_type;
-{
+#   elif (4 == kercn)
-    v = v - ((v >> 1) & 0x55555555);                    // reuse input as temporary
+        typedef int4 value_type;
-    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     // temp
+#   else
-    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
+        typedef int value_type;
-}
+#   endif
-#define DIST(x, y) bit1Count( (x) ^ (y) )
+    typedef int result_type;
-typedef int value_type;
+#   define DIST(x, y) result += popcount( (x) ^ (y) )
-typedef int result_type;
+#   define DIST_RES(x) (x)
 #define DIST_RES(x) (x)
 #endif
 inline result_type reduce_block(
@ -105,9 +137,7 @@ inline result_type reduce_block(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
            s_query[lidy * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
    return DIST_RES(result);
 }
@ -123,11 +153,9 @@ inline result_type reduce_block_match(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
            s_query[lidy * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
-    return (result);
+    return result;
 }
 inline result_type reduce_multi_block(
@ -142,23 +170,16 @@ inline result_type reduce_multi_block(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
+        DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
            s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
    return result;
 }
-/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
+__kernel void BruteForceMatch_Match(
 local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
 */
 __kernel void BruteForceMatch_UnrollMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@ -170,17 +191,26 @@ __kernel void BruteForceMatch_UnrollMatch(
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    query_cols /= kercn;
    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
-    int queryIdx = groupidx * BLOCK_SIZE + lidy;
+#if 0 < MAX_DESC_LEN
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
    #pragma unroll
-    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
    {
-        int loadx = lidx + i * BLOCK_SIZE;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
 #else
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
 #endif
    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;
@ -189,12 +219,16 @@ __kernel void BruteForceMatch_UnrollMatch(
    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
    {
        result_type result = 0;
        const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
 #if 0 < MAX_DESC_LEN
        #pragma unroll
-        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
@ -203,89 +237,18 @@ __kernel void BruteForceMatch_UnrollMatch(
            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
+#else
-        result = DIST_RES(result);
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
        int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
        {
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float*)(sharebuffer);
    __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    //find BestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;
    barrier(CLK_LOCAL_MEM_FENCE);
    //reduce -- now all reduce implement in each threads.
    #pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
 }
 __kernel void BruteForceMatch_Match(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    // loop
    for (int t = 0 ;  t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0;
        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
        {
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(i, BLOCK_SIZE, lidx);
            //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
            if (loadx < query_cols)
            {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
@ -294,10 +257,10 @@ __kernel void BruteForceMatch_Match(
            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
+#endif
        result = DIST_RES(result);
-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
@ -309,17 +272,18 @@ __kernel void BruteForceMatch_Match(
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
    //findBestMatch
-    s_distance += lidy * BLOCK_SIZE;
+    s_distance += lidy * BLOCK_SIZE_ODD;
-    s_trainIdx += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;
    barrier(CLK_LOCAL_MEM_FENCE);
    //reduce -- now all reduce implement in each threads.
    #pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
@ -336,76 +300,14 @@ __kernel void BruteForceMatch_Match(
    }
 }
 //radius_unrollmatch
 __kernel void BruteForceMatch_RadiusUnrollMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);
    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    result_type result = 0;
    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
        const int loadx = lidx + i * BLOCK_SIZE;
        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);
        result += reduce_block(s_query, s_train, lidx, lidy);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (queryIdx < query_rows && trainIdx < train_rows &&
        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
        int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
        if(ind < bestTrainIdx_cols)
        {
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
        }
    }
 }
 //radius_match
 __kernel void BruteForceMatch_RadiusMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@ -420,20 +322,34 @@ __kernel void BruteForceMatch_RadiusMatch(
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);
-    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
-    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
    const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
    __global TN *train_vec = (__global TN *)(train + trainOffset);
    query_cols /= kercn;
    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
    result_type result = 0;
    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-        const int loadx = lidx + i * BLOCK_SIZE;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);
        s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
        s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
-        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        if (loadx < query_cols)
-        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        {
            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
        }
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);
@ -442,28 +358,23 @@ __kernel void BruteForceMatch_RadiusMatch(
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-
+    if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
    if (queryIdx < query_rows && trainIdx < train_rows &&
        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
        int ind = atom_inc(nMatches + queryIdx);
        if(ind < bestTrainIdx_cols)
        {
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+            bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+            bestDistance[mad24(queryIdx, ostep, ind)] = result;
        }
    }
 }
-
+__kernel void BruteForceMatch_knnMatch(
 __kernel void BruteForceMatch_knnUnrollMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@ -475,31 +386,45 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    query_cols /= kercn;
    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
 #if 0 < MAX_DESC_LEN
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
    #pragma unroll
    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
-        int loadx = lidx + i * BLOCK_SIZE;
+        int loadx = mad24(BLOCK_SIZE, i, lidx);
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
 #else
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
 #endif
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
-    //loopUnrolledCached
+    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0;
        int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
 #if 0 < MAX_DESC_LEN
        #pragma unroll
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
@ -508,143 +433,18 @@ __kernel void BruteForceMatch_knnUnrollMatch(
            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
+#else
-        result = DIST_RES(result);
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
        const int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (local float *)sharebuffer;
    __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    // find BestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;
    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
 }
 __kernel void BruteForceMatch_knnMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    //loop
    for (int  t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0.0f;
        for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
        {
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
            if (loadx < query_cols)
            {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
@ -653,12 +453,12 @@ __kernel void BruteForceMatch_knnMatch(
            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
+#endif
        result = DIST_RES(result);
-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
-        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
@ -678,12 +478,11 @@ __kernel void BruteForceMatch_knnMatch(
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
    //findBestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    // find BestMatch
    s_distance += lidy * BLOCK_SIZE_ODD;
    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;
@ -747,43 +546,3 @@ __kernel void BruteForceMatch_knnMatch(
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
 }
 kernel void BruteForceMatch_calcDistanceUnrolled(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step)
 {
    /* Todo */
 }
 kernel void BruteForceMatch_calcDistance(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step)
 {
    /* Todo */
 }
 kernel void BruteForceMatch_findBestMatch(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    int k
 )
 {
    /* Todo */
 }