diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 76f76ebc12..eafecd93cc 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -43,7 +43,7 @@ if(OPENCL_FOUND) set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) if (X86_64) - set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) + set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import) elseif (X86) set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) endif() diff --git a/cmake/cl2cpp.cmake b/cmake/cl2cpp.cmake index ca17c61b43..4f18e9e643 100644 --- a/cmake/cl2cpp.cmake +++ b/cmake/cl2cpp.cmake @@ -18,6 +18,7 @@ foreach(cl ${cl_list}) string(REPLACE "\t" " " lines "${lines}") string(REGEX REPLACE "/\\*([^*]/|\\*[^/]|[^*/])*\\*/" "" lines "${lines}") # multiline comments + string(REGEX REPLACE "/\\*([^\n])*\\*/" "" lines "${lines}") # single-line comments string(REGEX REPLACE "[ ]*//[^\n]*\n" "\n" lines "${lines}") # single-line comments string(REGEX REPLACE "\n[ ]*(\n[ ]*)*" "\n" lines "${lines}") # empty lines & leading whitespace string(REGEX REPLACE "^\n" "" lines "${lines}") # leading new line diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 818f3c18f3..e61a9f6330 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -44,6 +44,7 @@ //M*/ #include "precomp.hpp" + #include #include #include @@ -60,10 +61,11 @@ namespace cv } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -91,20 +93,21 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat std::string kernelName = "BruteForceMatch_UnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/, const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/) { } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -130,21 +133,22 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_Match"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/, const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/) { } //radius_matchUnrolledCached -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -176,15 +180,16 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } //radius_match -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -214,263 +219,70 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c std::string kernelName = "BruteForceMatch_RadiusMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); - //float *dis = (float *)clEnqueueMapBuffer(ctx->impl->clCmdQueue, (cl_mem)distance.data, CL_TRUE, CL_MAP_READ, 0, 8, 0, NULL, NULL, NULL); - //printf("%f, %f\n", dis[0], dis[1]); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -// with mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); - }*/ - else - { - match<16, T>(query, train, mask, trainIdx, distance, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &trainIdx, const oclMat &distance, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); + matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); + matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx, distance, distType); } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); - }*/ else { - match<16, T>(query, train, mask, trainIdx, distance, distType); + match<16>(query, train, tempMask, trainIdx, distance, distType); } } -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask, const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); + matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ else { - match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } -} - -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &trainIdx, - const oclMat &imgIdx, const oclMat &distance, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ - else - { - match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); + match<16>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } } //radius matchDispatcher -// with mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); + matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ - else - { - radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &trainIdx, - const oclMat &distance, const oclMat &nMatches, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ else { - radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &trainIdx, - const oclMat &distance, const oclMat &nMatches, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); + radius_match<16>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } } //knn match Dispatcher -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { @@ -501,11 +313,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl std::string kernelName = "BruteForceMatch_knnUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { @@ -534,11 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_knnMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -567,11 +379,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat std::string kernelName = "BruteForceMatch_calcDistanceUnrolled"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -598,69 +410,43 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask std::string kernelName = "BruteForceMatch_calcDistance"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } /////////////////////////////////////////////////////////////////////////////// // Calc Distance dispatcher -template < typename T/*, typename Mask*/ > -void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) { if (query.cols <= 64) { - calcDistanceUnrolled<16, 64, T>(query, train, mask, allDist, distType); + calcDistanceUnrolled<16, 64>(query, train, mask, allDist, distType); } else if (query.cols <= 128) { - calcDistanceUnrolled<16, 128, T>(query, train, mask, allDist, distType); + calcDistanceUnrolled<16, 128>(query, train, mask, allDist, distType); } - /*else if (query.cols <= 256) - { - calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 512) - { - calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 1024) - { - calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); - }*/ else { - calcDistance<16, T>(query, train, mask, allDist, distType); + calcDistance<16>(query, train, mask, allDist, distType); } } -template < typename T/*, typename Mask*/ > -void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, int distType) { if (query.cols <= 64) { - knn_matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); + knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType); } else if (query.cols <= 128) { - knn_matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ else { - knn_match<16, T>(query, train, mask, trainIdx, distance, distType); + knn_match<16>(query, train, mask, trainIdx, distance, distType); } } @@ -686,7 +472,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o //args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); //args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, trainIdx.depth(), -1); } } @@ -695,206 +481,22 @@ static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat & findKnnMatch<256>(k, trainIdx, distance, allDist, distType); } -//with mask -template < typename T/*, typename Mask*/ > -void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask, +static void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (k == 2) { - match2Dispatcher(query, train, mask, trainIdx, distance, distType); + match2Dispatcher(query, train, tempMask, trainIdx, distance, distType); } else { - calcDistanceDispatcher(query, train, mask, allDist, distType); + calcDistanceDispatcher(query, train, tempMask, allDist, distType); findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType); } } -//without mask -template < typename T/*, typename Mask*/ > -void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) -{ - oclMat mask; - if (k == 2) - { - match2Dispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - calcDistanceDispatcher(query, train, mask, allDist, distType); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType); - } -} - - - -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 0; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher< T >(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 0; - - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 1; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 1; - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 2; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher< T >(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 2; - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -// knn caller -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 0; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 1; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 2; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -//radius caller -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 0; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 1; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 2; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - cv::ocl::BruteForceMatcher_OCL_base::BruteForceMatcher_OCL_base(DistType distType_) : distType(distType_) { } @@ -929,38 +531,28 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const { if (query.empty() || train.empty()) return; + + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "singleMatch"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - typedef void (*caller_t)(const oclMat & query, const oclMat & train, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance); - - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.cols == query.cols && train.type() == query.type()); - const int nQuery = query.rows; - trainIdx.create(1, nQuery, CV_32S); - distance.create(1, nQuery, CV_32F); + trainIdx.create(1, query.rows, CV_32S); + distance.create(1, query.rows, CV_32F); - caller_t func = callers[distType][query.depth()]; - func(query, train, mask, trainIdx, distance); + matchDispatcher(query, train, mask, trainIdx, distance, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, vector &matches) @@ -1062,40 +654,27 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c if (query.empty() || trainCollection.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks, - const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "matchCollection"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - const int nQuery = query.rows; - - trainIdx.create(1, nQuery, CV_32S); - imgIdx.create(1, nQuery, CV_32S); - distance.create(1, nQuery, CV_32F); - - caller_t func = callers[distType][query.depth()]; - CV_Assert(func != 0); + trainIdx.create(1, query.rows, CV_32S); + imgIdx.create(1, query.rows, CV_32S); + distance.create(1, query.rows, CV_32F); - func(query, trainCollection, masks, trainIdx, imgIdx, distance); + matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, vector &matches) @@ -1164,52 +743,39 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co if (query.empty() || train.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & train, int k, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance, const oclMat & allDist); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); - static const caller_t callers[3][6] = + char cvFuncName[] = "knnMatchSingle"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); - const int nQuery = query.rows; - const int nTrain = train.rows; - if (k == 2) { - trainIdx.create(1, nQuery, CV_32SC2); - distance.create(1, nQuery, CV_32FC2); + trainIdx.create(1, query.rows, CV_32SC2); + distance.create(1, query.rows, CV_32FC2); } else { - trainIdx.create(nQuery, k, CV_32S); - distance.create(nQuery, k, CV_32F); - allDist.create(nQuery, nTrain, CV_32FC1); + trainIdx.create(query.rows, k, CV_32S); + distance.create(query.rows, k, CV_32F); + allDist.create(query.rows, train.rows, CV_32FC1); } trainIdx.setTo(Scalar::all(-1)); - caller_t func = callers[distType][query.depth()]; - CV_Assert(func != 0); - - func(query, train, k, mask, trainIdx, distance, allDist); + kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, vector< vector > &matches, bool compactResult) @@ -1394,8 +960,6 @@ namespace void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector< vector > &matches, int k, const vector &masks, bool compactResult) { - - if (k == 2) { oclMat trainCollection; @@ -1455,50 +1019,34 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, if (query.empty() || train.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & train, float maxDistance, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance, const oclMat & nMatches); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "radiusMatchSingle"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - //#if 0 - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; - //#endif - - const int nQuery = query.rows; - const int nTrain = train.rows; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); - CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); + CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size())); - nMatches.create(1, nQuery, CV_32SC1); + nMatches.create(1, query.rows, CV_32SC1); if (trainIdx.empty()) { - trainIdx.create(nQuery, std::max((nTrain / 100), 10), CV_32SC1); - distance.create(nQuery, std::max((nTrain / 100), 10), CV_32FC1); + trainIdx.create(query.rows, std::max((train.rows/ 100), 10), CV_32SC1); + distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1); } nMatches.setTo(Scalar::all(0)); - caller_t func = callers[distType][query.depth()]; - //CV_Assert(func != 0); - //func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); - func(query, train, maxDistance, mask, trainIdx, distance, nMatches); + matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, @@ -1697,5 +1245,3 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); } - - diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 4e0f5b85d3..1c727f01f9 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS //int flag = 0; oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1); - oclMat gsum(totalheight, gimg.cols + 1, CV_32SC1); - oclMat gsqsum(totalheight, gimg.cols + 1, CV_32FC1); + oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1); + oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1); //cl_mem cascadebuffer; cl_mem stagebuffer; diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp index ab867d4d31..1f76d633dc 100644 --- a/modules/ocl/src/match_template.cpp +++ b/modules/ocl/src/match_template.cpp @@ -71,6 +71,9 @@ namespace cv void matchTemplate_SQDIFF_NORMED( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); + void convolve_32F( + const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); + void matchTemplate_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); @@ -90,41 +93,65 @@ namespace cv void matchTemplateNaive_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, int cn); + void extractFirstChannel_32F( + const oclMat &image, oclMat &result); + // Evaluates optimal template's area threshold. If // template's area is less than the threshold, we use naive match // template version, otherwise FFT-based (if available) - static int getTemplateThreshold(int method, int depth) + static bool useNaive(int , int , Size ) { - switch (method) - { - case CV_TM_CCORR: - if (depth == CV_32F) return 250; - if (depth == CV_8U) return 300; - break; - case CV_TM_SQDIFF: - if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F - if (depth == CV_8U) return 300; - break; - } - CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode"); - return 0; + // FIXME! + // always use naive until convolve is imported + return true; } ////////////////////////////////////////////////////////////////////// // SQDIFF void matchTemplate_SQDIFF( - const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &) + const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size())) { matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels()); return; } else { - // TODO - CV_Error(CV_StsBadArg, "Not supported yet for this size template"); + buf.image_sqsums.resize(1); + + // TODO, add double support for ocl::integral + // use CPU integral temporarily + Mat sums, sqsums; + cv::integral(Mat(image.reshape(1)), sums, sqsums); + buf.image_sqsums[0] = sqsums; + + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; + matchTemplate_CCORR(image, templ, result, buf); + + //port CUDA's matchTemplatePrepared_SQDIFF_8U + Context *clCxt = image.clCxt; + string kernelName = "matchTemplate_Prepared_SQDIFF"; + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + + const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : ""; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt); } } @@ -134,7 +161,6 @@ namespace cv matchTemplate_CCORR(image, templ, result, buf); buf.image_sums.resize(1); - integral(image.reshape(1), buf.image_sums[0]); unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; @@ -156,7 +182,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); } @@ -191,33 +217,39 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } ////////////////////////////////////////////////////////////////////// // CCORR + void convolve_32F( + const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &) + { + CV_Error(-1, "convolve is not fully implemented yet"); + } + void matchTemplate_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + if (useNaive(CV_TM_CCORR, image.depth(), templ.size())) { matchTemplateNaive_CCORR(image, templ, result, image.oclchannels()); return; } else { - CV_Error(CV_StsBadArg, "Not supported yet for this size template"); if(image.depth() == CV_8U && templ.depth() == CV_8U) { image.convertTo(buf.imagef, CV_32F); templ.convertTo(buf.templf, CV_32F); + convolve_32F(buf.imagef, buf.templf, result, buf); + } + else + { + convolve_32F(image, templ, result, buf); } - CV_Assert(image.oclchannels() == 1); - oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels())); - filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0)); - result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1)); } } @@ -249,7 +281,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); } @@ -284,7 +316,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } ////////////////////////////////////////////////////////////////////// @@ -301,7 +333,7 @@ namespace cv kernelName = "matchTemplate_Prepared_CCOFF"; size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); @@ -313,22 +345,22 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + Vec4f templ_sum = Vec4f::all(0); // to be continued in the following section if(image.oclchannels() == 1) { buf.image_sums.resize(1); integral(image, buf.image_sums[0]); - float templ_sum = 0; - templ_sum = (float)sum(templ)[0] / templ.size().area(); + templ_sum[0] = (float)sum(templ)[0] / templ.size().area(); args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) ); } else { - Vec4f templ_sum = Vec4f::all(0); + split(image, buf.images); templ_sum = sum(templ) / templ.size().area(); buf.image_sums.resize(buf.images.size()); @@ -374,7 +406,7 @@ namespace cv kernelName = "matchTemplate_Prepared_CCOFF_NORMED"; size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); @@ -387,20 +419,22 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_float), (void *)&scale) ); + + Vec4f templ_sum = Vec4f::all(0); + Vec4f templ_sqsum = Vec4f::all(0); // to be continued in the following section if(image.oclchannels() == 1) { buf.image_sums.resize(1); buf.image_sqsums.resize(1); integral(image, buf.image_sums[0], buf.image_sqsums[0]); - float templ_sum = 0; - float templ_sqsum = 0; - templ_sum = (float)sum(templ)[0]; - templ_sqsum = sqrSum(templ)[0]; + templ_sum[0] = (float)sum(templ)[0]; - templ_sqsum -= scale * templ_sum * templ_sum; - templ_sum *= scale; + templ_sqsum[0] = sqrSum(templ)[0]; + + templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0]; + templ_sum[0] *= scale; args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); @@ -408,13 +442,11 @@ namespace cv args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) ); } else { - Vec4f templ_sum = Vec4f::all(0); - Vec4f templ_sqsum = Vec4f::all(0); split(image, buf.images); templ_sum = sum(templ); @@ -465,7 +497,27 @@ namespace cv } openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } + void extractFirstChannel_32F(const oclMat &image, oclMat &result) + { + Context *clCxt = image.clCxt; + string kernelName; + + kernelName = "extractFirstChannel"; + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1); + } }/*ocl*/ } /*cv*/ diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 285041ddda..8028ca5c7c 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -143,7 +143,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); - + cv::Mat dst(dst_a); a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE)) @@ -277,16 +277,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) blocky = size.height/TILE_SIZE; else blocky = size.height/TILE_SIZE + 1; - cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1); + cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1); cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double)); int tile_width = std::min(size.width,TILE_SIZE); int tile_height = std::min(size.height,TILE_SIZE); @@ -299,25 +290,17 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width )); args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.step )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky )); args.push_back( make_pair( sizeof(cl_int) , (void *)&type )); args.push_back( make_pair( sizeof(cl_int) , (void *)&depth )); args.push_back( make_pair( sizeof(cl_int) , (void *)&cn )); args.push_back( make_pair( sizeof(cl_int) , (void *)&coi )); args.push_back( make_pair( sizeof(cl_int) , (void *)&binary )); args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); size_t localThreadss[3] = { 128, 1, 1}; size_t globalThreadss[3] = { 128, 1, 1}; @@ -327,20 +310,12 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data )); - openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); + args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); + openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); double* dstsum = new double[10]; memset(dstsum,0,10*sizeof(double)); - openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double)); + openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double)); mom->m00 = dstsum[0]; mom->m10 = dstsum[1]; mom->m01 = dstsum[2]; @@ -351,6 +326,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) mom->m21 = dstsum[7]; mom->m12 = dstsum[8]; mom->m03 = dstsum[9]; + delete [] dstsum; icvCompleteMomentState( mom ); } diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 0730ac5ac7..e76fb1d21e 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -5,19 +5,93 @@ int bit1Count(float x) { int c = 0; int ix = (int)x; - for (int i = 0 ; i < 32 ; i++) { c += ix & 0x1; ix >>= 1; } - return (float)c; } + +float reduce_block(__local float *s_query, + __local float *s_train, + int block_size, + int lidx, + int lidy, + int distType + ) +{ + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + float result = 0; + switch(distType) + { + case 0: + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + break; + case 1: + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + break; + case 2: + for (int j = 0 ; j < block_size ; j++) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); + } + break; + } + return result; +} + +float reduce_multi_block(__local float *s_query, + __local float *s_train, + int max_desc_len, + int block_size, + int block_index, + int lidx, + int lidy, + int distType + ) +{ + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + float result = 0; + switch(distType) + { + case 0: + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]); + } + break; + case 1: + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + break; + case 2: + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + } + break; + } + return result; +} + /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size local size: dim0 is block_size, dim1 is block_size. */ -__kernel void BruteForceMatch_UnrollMatch( +__kernel void BruteForceMatch_UnrollMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch( __local float *s_train = sharebuffer + block_size * max_desc_len; int queryIdx = groupidx * block_size + lidy; - // load the query into local memory. for (int i = 0 ; i < max_desc_len / block_size; i ++) { @@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) { //load a block_size * block_size block into local train. @@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch( } barrier(CLK_LOCAL_MEM_FENCE); - __local float *s_distance = (__local float *)(sharebuffer); - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local float *s_distance = (__local float*)(sharebuffer); + __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); //find BestMatch s_distance += lidy * block_size; @@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch( } } -__kernel void BruteForceMatch_Match( +__kernel void BruteForceMatch_Match_D5( __global float *query, __global float *train, //__global float *mask, @@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match( { //Dist dist; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) { const int loadx = lidx + i * block_size; @@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match( barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match( } //radius_unrollmatch -__kernel void BruteForceMatch_RadiusUnrollMatch( +__kernel void BruteForceMatch_RadiusUnrollMatch_D5( __global float *query, __global float *train, float maxDistance, @@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( __local float *s_train = sharebuffer + block_size * block_size; float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; ++i) { //load a block_size * block_size block into local train. @@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( { unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - if (ind < bestTrainIdx_cols) + if(ind < bestTrainIdx_cols) { //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; @@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( } //radius_match -__kernel void BruteForceMatch_RadiusMatch( +__kernel void BruteForceMatch_RadiusMatch_D5( __global float *query, __global float *train, float maxDistance, @@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch( __local float *s_train = sharebuffer + block_size * block_size; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) { //load a block_size * block_size block into local train. @@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + unsigned int ind = atom_inc(nMatches + queryIdx); - if (ind < bestTrainIdx_cols) + if(ind < bestTrainIdx_cols) { //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; @@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch( } -__kernel void BruteForceMatch_knnUnrollMatch( +__kernel void BruteForceMatch_knnUnrollMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch( //loopUnrolledCached volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) { const int loadX = lidx + i * block_size; @@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch( for (int i = 0 ; i < block_size ; i++) { float val = s_distance[i]; - if (val < bestDistance1) { bestDistance2 = bestDistance1; @@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( } } -__kernel void BruteForceMatch_knnMatch( +__kernel void BruteForceMatch_knnMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch( for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0.0f; - - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) { const int loadx = lidx + i * block_size; //load query and train into local memory @@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch( barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch( for (int i = 0 ; i < block_size ; i++) { float val = s_distance[i]; - if (val < bestDistance1) { bestDistance2 = bestDistance1; @@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch( } } -kernel void BruteForceMatch_calcDistanceUnrolled( +kernel void BruteForceMatch_calcDistanceUnrolled_D5( __global float *query, __global float *train, //__global float *mask, @@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled( /* Todo */ } -kernel void BruteForceMatch_calcDistance( +kernel void BruteForceMatch_calcDistance_D5( __global float *query, __global float *train, //__global float *mask, @@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance( /* Todo */ } -kernel void BruteForceMatch_findBestMatch( +kernel void BruteForceMatch_findBestMatch_D5( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 2fa0906b41..9e468b07f5 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int4 data = *(__global int4*)&sum[glb_off]; int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); +#if OFF lcldata[lcl_off] = data.x; lcldata[lcl_off+1] = data.y; lcldata[lcl_off+2] = data.z; lcldata[lcl_off+3] = data.w; +#else + vstore4(data, 0, &lcldata[lcl_off]); +#endif } lcloutindex[lcl_id] = 0; @@ -559,3 +563,7 @@ if(result) } } */ + + + + diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 3133e62371..857f891c38 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -45,22 +45,28 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable +#if defined (DOUBLE_SUPPORT) -#elif defined (__NVIDIA__) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable #endif -#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__)) #define TYPE_IMAGE_SQSUM double #else -#define TYPE_IMAGE_SQSUM ulong +#define TYPE_IMAGE_SQSUM float +#endif + +#ifndef CN4 +#define CN4 1 +#else +#define CN4 4 #endif ////////////////////////////////////////////////// // utilities -#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox) +#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4) #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one @@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum) __kernel void normalizeKernel_C1_D0 ( - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, __global float * res, ulong tpl_sqsum, int res_rows, @@ -119,8 +125,8 @@ void normalizeKernel_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); } } @@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; } } @@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0 __kernel void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 ( - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, __global float * res, ulong tpl_sqsum, int res_rows, @@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, - sqrt(image_sqsum_ * tpl_sqsum)); + sqrt(image_sqsum_ * tpl_sqsum)); } } @@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { float sum = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); res[res_idx] -= sum * tpl_sum; } } @@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0 { float ccorr = res[res_idx]; ccorr -= tpl_sum_c0*(float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c1*(float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c2*(float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c3*(float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); res[res_idx] = ccorr; } } @@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 __global const uint * img_sums, int img_sums_offset, int img_sums_step, - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, int img_sqsums_offset, int img_sqsums_step, float tpl_sum, @@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sum_ = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); } @@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 __global const uint * img_sums_c3, int img_sums_offset, int img_sums_step, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c0, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c1, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c2, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c3, + __global const float * img_sqsums_c0, + __global const float * img_sqsums_c1, + __global const float * img_sqsums_c2, + __global const float * img_sqsums_c3, int img_sqsums_offset, int img_sqsums_step, float tpl_sum_c0, @@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 if(gidx < res_cols && gidy < res_rows) { float image_sum_c0 = (float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); float image_sum_c1 = (float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); float image_sum_c2 = (float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); float image_sum_c3 = (float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); float image_sqsum_c0 = (float)( - (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); + (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); float image_sqsum_c1 = (float)( - (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); + (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); float image_sqsum_c2 = (float)( - (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); + (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); float image_sqsum_c3 = (float)( - (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); + (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); float num = res[res_idx] - - image_sum_c0 * tpl_sum_c0 - - image_sum_c1 * tpl_sum_c1 - - image_sum_c2 * tpl_sum_c2 - - image_sum_c3 * tpl_sum_c3; + image_sum_c0 * tpl_sum_c0 - + image_sum_c1 * tpl_sum_c1 - + image_sum_c2 * tpl_sum_c2 - + image_sum_c3 * tpl_sum_c3; float denum = sqrt( tpl_sqsum * ( - image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + - image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + - image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + - image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) - ); + image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + + image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + + image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + + image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) + ); res[res_idx] = normAcc(num, denum); } } + +////////////////////////////////////////////////////////////////////// +// extractFirstChannel +__kernel +void extractFirstChannel +( + const __global float4* img, + __global float* res, + int rows, + int cols, + int img_offset, + int res_offset, + int img_step, + int res_step +) +{ + img_step /= sizeof(float4); + res_step /= sizeof(float); + img_offset /= sizeof(float4); + res_offset /= sizeof(float); + img += img_offset; + res += res_offset; + int gidx = get_global_id(0); + int gidy = get_global_id(1); + if(gidx < cols && gidy < rows) + { + res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x; + } +} diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index 399ff32076..f8d6024e9f 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -6,25 +6,27 @@ #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif typedef double T; +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4 #else -typedef float double; -typedef float4 double4; +typedef float F; +typedef float4 F4; typedef long T; -#define convert_double4 convert_float4 +#define convert_F4 convert_float4 #endif -//#pragma OPENCL EXTENSION cl_amd_printf:enable -//#if defined (DOUBLE_SUPPORT) -#define DST_ROW_A00 0 -#define DST_ROW_A10 1 -#define DST_ROW_A01 2 -#define DST_ROW_A20 3 -#define DST_ROW_A11 4 -#define DST_ROW_A02 5 -#define DST_ROW_A30 6 -#define DST_ROW_A21 7 -#define DST_ROW_A12 8 -#define DST_ROW_A03 9 + +#define DST_ROW_00 0 +#define DST_ROW_10 1 +#define DST_ROW_01 2 +#define DST_ROW_20 3 +#define DST_ROW_11 4 +#define DST_ROW_02 5 +#define DST_ROW_30 6 +#define DST_ROW_21 7 +#define DST_ROW_12 8 +#define DST_ROW_03 9 __kernel void icvContourMoments(int contour_total, __global float* reader_oclmat_data, @@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total, yii_1 = yi_1 + yi; dst_step /= sizeof(T); - *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; - *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; - *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1; - *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); - *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); - *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); - *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); - *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); - *( dst_a + DST_ROW_A21 * dst_step + idx) = + *( dst_a + DST_ROW_00 * dst_step + idx) = dxy; + *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; + *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; + *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); + *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); + *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); + *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); + *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); + *( dst_a + DST_ROW_21 * dst_step + idx) = dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + xi2 * (yi_1 + 3 * yi)); - *( dst_a + DST_ROW_A12 * dst_step + idx) = + *( dst_a + DST_ROW_12 * dst_step + idx) = dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + yi2 * (xi_1 + 3 * xi)); } -//#endif -//#if defined (DOUBLE_SUPPORT) +__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, + __global F* sum, __global F* dst_m, int dst_step) +{ + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int block_y = src_rows/tile_height; + int block_x = src_cols/tile_width; + int block_num; + + if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) + block_y ++; + if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) + block_x ++; + block_num = block_y * block_x; + __local F dst_sum[10][128]; + if(gidy<128-block_num) + for(int i=0; i<10; i++) + dst_sum[i][gidy+block_num]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + dst_step /= sizeof(F); + if(gidy0; lsize>>=1) + { + if(gidy TILE_SIZE && src_rows % TILE_SIZE != 0) - block_y ++; - if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) - block_x ++; - block_num = block_y * block_x; - __local double dst_sum[10][128]; - if(gidy<128-block_num) - for(int i=0; i<10; i++) - dst_sum[i][gidy+block_num]=0; - barrier(CLK_LOCAL_MEM_FENCE); - if(gidy0; lsize>>=1) - { - if(gidy= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) { - m[9][lidy-bheight] = ((double)py) * sy; // m03 - m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 m[6][lidy-bheight] = x3.s0; // m30 m[5][lidy-bheight] = x0.s0 * sy; // m02 m[4][lidy-bheight] = x1.s0 * lidy; // m11 @@ -714,11 +672,12 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols m[1][lidy-bheight] = x1.s0; // m10 m[0][lidy-bheight] = x0.s0; // m00 } + else if(lidy < bheight) { - lm[9] = ((double)py) * sy; // m03 - lm[8] = ((double)x1.s0) * sy; // m12 - lm[7] = ((double)x2.s0) * lidy; // m21 + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 lm[6] = x3.s0; // m30 lm[5] = x0.s0 * sy; // m02 lm[4] = x1.s0 * lidy; // m11 @@ -741,69 +700,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols } if(lidy == 0&&lidx == 0) { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (double)lm[mt]; - + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; if(binary) { - double s = 1./255; + F s = 1./255; for( int mt = 0; mt < 10; mt++ ) mom[mt] *= s; } - double xm = x * mom[0], ym = y * mom[0]; + F xm = x * mom[0], ym = y * mom[0]; // accumulate moments computed in each tile + dst_step /= sizeof(F); // + m00 ( = m00' ) - dst_m00[wgidy*dst_cols+wgidx]= mom[0]; + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; // + m10 ( = m10' + x*m00' ) - dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; // + m01 ( = m01' + y*m00' ) - dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym); + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - }*/ + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } } -//#endif -//#if defined (DOUBLE_SUPPORT) -__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, - __global double* dst_m00, - __global double* dst_m10, - __global double* dst_m01, - __global double* dst_m20, - __global double* dst_m11, - __global double* dst_m02, - __global double* dst_m30, - __global double* dst_m21, - __global double* dst_m12, - __global double* dst_m03, - int dst_cols, int dst_step, + +__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) { - double tmp_coi[4]; // get the coi data - double4 tmp[64]; + F tmp_coi[4]; // get the coi data + F4 tmp[64]; int VLEN_D = 4; // length of vetor int gidy = get_global_id(0); int gidx = get_global_id(1); @@ -820,39 +769,39 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col if(tileSize_width < TILE_SIZE) for(int i = tileSize_width; i < rstep; i++ ) - *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0; + *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; if( coi > 0 ) for(int i=0; i < tileSize_width; i+=VLEN_D) { for(int j=0; j<4; j++) tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); + tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); } else for(int i=0; i < tileSize_width; i+=VLEN_D) - tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); - double4 zero = (double4)(0); - double4 full = (double4)(255); + tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); + F4 zero = (F4)(0); + F4 full = (F4)(255); if( binary ) for(int i=0; i < tileSize_width; i+=VLEN_D) tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; - double mom[10]; - __local double m[10][128]; + F mom[10]; + __local F m[10][128]; if(lidy == 0) for(int i=0; i<10; i++) for(int j=0; j<128; j++) m[i][j]=0; barrier(CLK_LOCAL_MEM_FENCE); - double lm[10] = {0}; - double4 x0 = (double4)(0); - double4 x1 = (double4)(0); - double4 x2 = (double4)(0); - double4 x3 = (double4)(0); + F lm[10] = {0}; + F4 x0 = (F4)(0); + F4 x1 = (F4)(0); + F4 x2 = (F4)(0); + F4 x3 = (F4)(0); for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D ) { - double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3); - double4 p = tmp[xt/VLEN_D]; - double4 xp = v_xt * p, xxp = xp * v_xt; + F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); + F4 p = tmp[xt/VLEN_D]; + F4 xp = v_xt * p, xxp = xp * v_xt; x0 += p; x1 += xp; x2 += xxp; @@ -863,13 +812,13 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col x2.s0 += x2.s1 + x2.s2 + x2.s3; x3.s0 += x3.s1 + x3.s2 + x3.s3; - double py = lidy * x0.s0, sy = lidy*lidy; + F py = lidy * x0.s0, sy = lidy*lidy; int bheight = min(tileSize_height, TILE_SIZE/2); if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) { - m[9][lidy-bheight] = ((double)py) * sy; // m03 - m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 m[6][lidy-bheight] = x3.s0; // m30 m[5][lidy-bheight] = x0.s0 * sy; // m02 m[4][lidy-bheight] = x1.s0 * lidy; // m11 @@ -881,9 +830,9 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col else if(lidy < bheight) { - lm[9] = ((double)py) * sy; // m03 - lm[8] = ((double)x1.s0) * sy; // m12 - lm[7] = ((double)x2.s0) * lidy; // m21 + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 lm[6] = x3.s0; // m30 lm[5] = x0.s0 * sy; // m02 lm[4] = x1.s0 * lidy; // m11 @@ -907,47 +856,47 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col if(lidy == 0&&lidx == 0) { for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (double)lm[mt]; + mom[mt] = (F)lm[mt]; if(binary) { - double s = 1./255; + F s = 1./255; for( int mt = 0; mt < 10; mt++ ) mom[mt] *= s; } - double xm = x * mom[0], ym = y * mom[0]; + F xm = x * mom[0], ym = y * mom[0]; // accumulate moments computed in each tile + dst_step /= sizeof(F); // + m00 ( = m00' ) - dst_m00[wgidy*dst_cols+wgidx] = mom[0]; + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; // + m10 ( = m10' + x*m00' ) - dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; // + m01 ( = m01' + y*m00' ) - dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); } -} -//#endif +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl index d603ad6bce..0b7f0c9025 100644 --- a/modules/ocl/src/opencl/pyr_up.cl +++ b/modules/ocl/src/opencl/pyr_up.cl @@ -16,6 +16,8 @@ // // @Authors // Zhang Chunpeng chunpeng@multicorewareinc.com +// Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, yao@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -53,20 +55,22 @@ uchar get_valid_uchar(uchar data) ////////////////////////// CV_8UC1 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - __local float s_srcPatch[10][10]; __local float s_dstPatch[20][16]; + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1; + int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -74,25 +78,24 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float sum = 0; - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); if(eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; @@ -103,42 +106,40 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } - if (get_local_id(1) > 13) - { + if (get_local_id(1) > 13) + { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); sum = 0; - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = (float)(4.0f * sum); @@ -149,8 +150,8 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, ////////////////////////// CV_16UC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -210,13 +211,13 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } - if (get_local_id(1) > 13) - { + if (get_local_id(1) > 13) + { sum = 0; if (eveny) @@ -228,7 +229,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + } barrier(CLK_LOCAL_MEM_FENCE); @@ -251,12 +252,15 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, ////////////////////////// CV_32FC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float s_srcPatch[10][10]; __local float s_dstPatch[20][16]; @@ -266,10 +270,10 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, dstStep = dstStep >> 2; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1; + int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -277,71 +281,67 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float sum = 0; - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); + if(eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[tidy][tidx] = sum; + } - if (get_local_id(1) > 13) - { + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = (float)(4.0f * sum); @@ -376,37 +376,16 @@ uchar4 convert_float4_to_uchar4(float4 data) return u4Data; } -float4 int_x_float4(int leftOpr,float4 rightOpr) -{ - float4 result = {0,0,0,0}; - - result.x = rightOpr.x * leftOpr; - result.y = rightOpr.y * leftOpr; - result.z = rightOpr.z * leftOpr; - result.w = rightOpr.w * leftOpr; - - return result; -} - -float4 float4_x_float4(float4 leftOpr,float4 rightOpr) -{ - float4 result; - - result.x = leftOpr.x * rightOpr.x; - result.y = leftOpr.y * rightOpr.y; - result.z = leftOpr.z * rightOpr.z; - result.w = leftOpr.w * rightOpr.w; - - return result; -} - __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float4 s_srcPatch[10][10]; __local float4 s_dstPatch[20][16]; @@ -416,10 +395,10 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, dstStep >>= 2; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1; + int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -427,17 +406,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float4 sum = (float4)(0,0,0,0); - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); @@ -446,63 +424,59 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[tidy][tidx] = sum; + } + + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum)); + dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum); } } /////////////////////////////////////////////////////////////////////// @@ -535,8 +509,8 @@ ushort4 convert_float4_to_ushort4(float4 data) __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -580,11 +554,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; } @@ -596,31 +570,31 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + } barrier(CLK_LOCAL_MEM_FENCE); @@ -628,15 +602,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, const int tidy = get_local_id(1); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum)); + dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum); } } @@ -644,12 +618,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, ////////////////////////// CV_32FC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float4 s_srcPatch[10][10]; __local float4 s_dstPatch[20][16]; @@ -659,10 +636,10 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, dstStep >>= 4; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -670,17 +647,16 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float4 sum = (float4)(0,0,0,0); - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); @@ -689,59 +665,55 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[tidy][tidx] = sum; + } + + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) { diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 954283987b..99177c7bd0 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -323,7 +323,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols) float conv = 0; int y1 = y==0? 0 : y-1; int x1 = x==0? 0 : x-1; - if(x < cols && y < rows) + if(x < cols && y < rows && x > 0 && y > 0) { conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) + (float)input[(y) * cols + (x1)] * (-2) + (float)input[(y) * cols + (x+1)] * (2) + diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp index bdf1f8a4af..424781fe0a 100644 --- a/modules/ocl/test/test_brute_force_matcher.cpp +++ b/modules/ocl/test/test_brute_force_matcher.cpp @@ -110,7 +110,7 @@ namespace } }; - TEST_P(BruteForceMatcher, DISABLED_Match_Single) + TEST_P(BruteForceMatcher, Match_Single) { cv::ocl::BruteForceMatcher_OCL_base matcher(distType); @@ -130,7 +130,7 @@ namespace ASSERT_EQ(0, badCount); } - TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single) + TEST_P(BruteForceMatcher, KnnMatch_2_Single) { const int knn = 2; diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp index 2fc6a10f5a..5da7f01cd8 100644 --- a/modules/ocl/test/test_match_template.cpp +++ b/modules/ocl/test/test_match_template.cpp @@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho } }; -TEST_P(MatchTemplate8U, DISABLED_Accuracy) +TEST_P(MatchTemplate8U, Accuracy) { std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl; @@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy) EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U, testing::Combine( MTEMP_SIZES, - testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))), testing::Values(Channels(1), Channels(3), Channels(4)), ALL_TEMPLATE_METHODS ) ); -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine( MTEMP_SIZES, - testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))), testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); #endif