mirror of https://github.com/opencv/opencv.git
commit
b6365699ee
135 changed files with 3043 additions and 3176 deletions
@ -1,154 +1,104 @@ |
||||
if(APPLE) |
||||
set(OPENCL_FOUND YES) |
||||
set(OPENCL_LIBRARIES "-framework OpenCL") |
||||
else() |
||||
set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library") |
||||
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") |
||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) |
||||
else(APPLE) |
||||
find_package(OpenCL QUIET) |
||||
if(WITH_OPENCLAMDFFT) |
||||
set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH}) |
||||
if(NOT CLAMDFFT_SEARCH_PATH) |
||||
if(WIN32) |
||||
set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" ) |
||||
endif() |
||||
endif() |
||||
set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include ) |
||||
if(UNIX) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib) |
||||
else() |
||||
set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64) |
||||
endif() |
||||
else() |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import) |
||||
else() |
||||
set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import) |
||||
endif() |
||||
|
||||
if (NOT OPENCL_FOUND) |
||||
find_path(OPENCL_ROOT_DIR |
||||
NAMES OpenCL/cl.h CL/cl.h include/CL/cl.h include/nvidia-current/CL/cl.h |
||||
PATHS ENV OCLROOT ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT |
||||
DOC "OpenCL root directory" |
||||
NO_DEFAULT_PATH) |
||||
|
||||
find_path(OPENCL_INCLUDE_DIR |
||||
NAMES OpenCL/cl.h CL/cl.h |
||||
HINTS ${OPENCL_ROOT_DIR} |
||||
PATH_SUFFIXES include include/nvidia-current |
||||
DOC "OpenCL include directory") |
||||
|
||||
if (X86_64) |
||||
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64) |
||||
elseif (X86) |
||||
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86) |
||||
endif() |
||||
|
||||
find_library(OPENCL_LIBRARY |
||||
NAMES OpenCL |
||||
HINTS ${OPENCL_ROOT_DIR} |
||||
PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES} |
||||
DOC "OpenCL library") |
||||
|
||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) |
||||
include(FindPackageHandleStandardArgs) |
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR ) |
||||
endif() |
||||
endif(APPLE) |
||||
|
||||
if(OPENCL_FOUND) |
||||
set(HAVE_OPENCL 1) |
||||
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR}) |
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) |
||||
|
||||
if (X86_64) |
||||
set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) |
||||
elseif (X86) |
||||
set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) |
||||
endif() |
||||
|
||||
if(WITH_OPENCLAMDFFT) |
||||
find_path(CLAMDFFT_ROOT_DIR |
||||
NAMES include/clAmdFft.h |
||||
PATHS ENV CLAMDFFT_PATH ENV ProgramFiles |
||||
PATH_SUFFIXES clAmdFft AMD/clAmdFft |
||||
DOC "AMD FFT root directory" |
||||
NO_DEFAULT_PATH) |
||||
|
||||
find_path(CLAMDFFT_INCLUDE_DIR |
||||
NAMES clAmdFft.h |
||||
PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH} |
||||
PATH_SUFFIXES clAmdFft |
||||
NO_DEFAULT_PATH) |
||||
NAMES clAmdFft.h |
||||
HINTS ${CLAMDFFT_ROOT_DIR} |
||||
PATH_SUFFIXES include |
||||
DOC "clAmdFft include directory") |
||||
|
||||
find_library(CLAMDFFT_LIBRARY |
||||
NAMES clAmdFft.Runtime |
||||
PATHS ${CLAMDFFT_LIB_SEARCH_PATH} |
||||
NO_DEFAULT_PATH) |
||||
if(CLAMDFFT_LIBRARY) |
||||
set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY}) |
||||
else() |
||||
set(CLAMDFFT_LIBRARIES "") |
||||
endif() |
||||
endif() |
||||
if(WITH_OPENCLAMDBLAS) |
||||
set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH}) |
||||
if(NOT CLAMDBLAS_SEARCH_PATH) |
||||
if(WIN32) |
||||
set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" ) |
||||
endif() |
||||
endif() |
||||
set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include ) |
||||
if(UNIX) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib) |
||||
else() |
||||
set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64) |
||||
endif() |
||||
else() |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import) |
||||
else() |
||||
set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import) |
||||
endif() |
||||
endif() |
||||
find_path(CLAMDBLAS_INCLUDE_DIR |
||||
NAMES clAmdBlas.h |
||||
PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH} |
||||
PATH_SUFFIXES clAmdBlas |
||||
NO_DEFAULT_PATH) |
||||
find_library(CLAMDBLAS_LIBRARY |
||||
NAMES clAmdBlas |
||||
PATHS ${CLAMDBLAS_LIB_SEARCH_PATH} |
||||
NO_DEFAULT_PATH) |
||||
if(CLAMDBLAS_LIBRARY) |
||||
set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY}) |
||||
else() |
||||
set(CLAMDBLAS_LIBRARIES "") |
||||
NAMES clAmdFft.Runtime |
||||
HINTS ${CLAMDFFT_ROOT_DIR} |
||||
PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} |
||||
DOC "clAmdFft library") |
||||
|
||||
if(CLAMDFFT_LIBRARY AND CLAMDFFT_INCLUDE_DIR) |
||||
set(HAVE_CLAMDFFT 1) |
||||
list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDFFT_INCLUDE_DIR}") |
||||
list(APPEND OPENCL_LIBRARIES "${CLAMDFFT_LIBRARY}") |
||||
endif() |
||||
endif() |
||||
# Try AMD/ATI Stream SDK |
||||
if (NOT OPENCL_FOUND) |
||||
set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) |
||||
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT}) |
||||
set(ENV_OPENCLROOT $ENV{OPENCLROOT}) |
||||
set(ENV_CUDA_PATH $ENV{CUDA_PATH}) |
||||
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT}) |
||||
if(ENV_AMDSTREAMSDKROOT) |
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86) |
||||
else() |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86_64) |
||||
endif() |
||||
elseif(ENV_AMDSTREAMSDKROOT) |
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86) |
||||
else() |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64) |
||||
endif() |
||||
elseif(ENV_CUDA_PATH AND WIN32) |
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32) |
||||
else() |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64) |
||||
endif() |
||||
elseif(ENV_OPENCLROOT AND UNIX) |
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib) |
||||
else() |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64) |
||||
endif() |
||||
elseif(ENV_INTELOCLSDKROOT) |
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include) |
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4) |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86) |
||||
else() |
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64) |
||||
endif() |
||||
endif() |
||||
|
||||
if(OPENCL_INCLUDE_SEARCH_PATH) |
||||
find_path(OPENCL_INCLUDE_DIR |
||||
NAMES CL/cl.h OpenCL/cl.h |
||||
PATHS ${OPENCL_INCLUDE_SEARCH_PATH} |
||||
NO_DEFAULT_PATH) |
||||
else() |
||||
find_path(OPENCL_INCLUDE_DIR |
||||
NAMES CL/cl.h OpenCL/cl.h) |
||||
endif() |
||||
if(WITH_OPENCLAMDBLAS) |
||||
find_path(CLAMDBLAS_ROOT_DIR |
||||
NAMES include/clAmdBlas.h |
||||
PATHS ENV CLAMDFFT_PATH ENV ProgramFiles |
||||
PATH_SUFFIXES clAmdBlas AMD/clAmdBlas |
||||
DOC "AMD FFT root directory" |
||||
NO_DEFAULT_PATH) |
||||
|
||||
if(OPENCL_LIB_SEARCH_PATH) |
||||
find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH) |
||||
else() |
||||
find_library(OPENCL_LIBRARY NAMES OpenCL) |
||||
endif() |
||||
find_path(CLAMDBLAS_INCLUDE_DIR |
||||
NAMES clAmdBlas.h |
||||
HINTS ${CLAMDBLAS_ROOT_DIR} |
||||
PATH_SUFFIXES include |
||||
DOC "clAmdFft include directory") |
||||
|
||||
include(FindPackageHandleStandardArgs) |
||||
find_package_handle_standard_args( |
||||
OPENCL |
||||
DEFAULT_MSG |
||||
OPENCL_LIBRARY OPENCL_INCLUDE_DIR |
||||
) |
||||
find_library(CLAMDBLAS_LIBRARY |
||||
NAMES clAmdBlas |
||||
HINTS ${CLAMDBLAS_ROOT_DIR} |
||||
PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} |
||||
DOC "clAmdBlas library") |
||||
|
||||
if(OPENCL_FOUND) |
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) |
||||
set(HAVE_OPENCL 1) |
||||
else() |
||||
set(OPENCL_LIBRARIES) |
||||
if(CLAMDBLAS_LIBRARY AND CLAMDBLAS_INCLUDE_DIR) |
||||
set(HAVE_CLAMDBLAS 1) |
||||
list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDBLAS_INCLUDE_DIR}") |
||||
list(APPEND OPENCL_LIBRARIES "${CLAMDBLAS_LIBRARY}") |
||||
endif() |
||||
else() |
||||
set(HAVE_OPENCL 1) |
||||
endif() |
||||
endif() |
||||
|
@ -0,0 +1,124 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_NONFREE_OCL_HPP__ |
||||
#define __OPENCV_NONFREE_OCL_HPP__ |
||||
|
||||
#include "opencv2/ocl/ocl.hpp" |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
//! Speeded up robust features, port from GPU module.
|
||||
////////////////////////////////// SURF //////////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS SURF_OCL |
||||
{ |
||||
public: |
||||
enum KeypointLayout |
||||
{ |
||||
X_ROW = 0, |
||||
Y_ROW, |
||||
LAPLACIAN_ROW, |
||||
OCTAVE_ROW, |
||||
SIZE_ROW, |
||||
ANGLE_ROW, |
||||
HESSIAN_ROW, |
||||
ROWS_COUNT |
||||
}; |
||||
|
||||
//! the default constructor
|
||||
SURF_OCL(); |
||||
//! the full constructor taking all the necessary parameters
|
||||
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4, |
||||
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false); |
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
int descriptorSize() const; |
||||
//! upload host keypoints to device memory
|
||||
void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl); |
||||
//! download keypoints from device to host memory
|
||||
void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints); |
||||
//! download descriptors from device to host memory
|
||||
void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors); |
||||
//! finds the keypoints using fast hessian detector used in SURF
|
||||
//! supports CV_8UC1 images
|
||||
//! keypoints will have nFeature cols and 6 rows
|
||||
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
|
||||
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
|
||||
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
|
||||
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
|
||||
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints); |
||||
//! finds the keypoints and computes their descriptors.
|
||||
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors, |
||||
bool useProvidedKeypoints = false); |
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints); |
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors, |
||||
bool useProvidedKeypoints = false); |
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors, |
||||
bool useProvidedKeypoints = false); |
||||
|
||||
void releaseMemory(); |
||||
|
||||
// SURF parameters
|
||||
float hessianThreshold; |
||||
int nOctaves; |
||||
int nOctaveLayers; |
||||
bool extended; |
||||
bool upright; |
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
float keypointsRatio; |
||||
oclMat sum, mask1, maskSum, intBuffer; |
||||
oclMat det, trace; |
||||
oclMat maxPosBuffer; |
||||
|
||||
}; |
||||
} |
||||
} |
||||
|
||||
#endif //__OPENCV_NONFREE_OCL_HPP__
|
@ -1,3 +1,4 @@ |
||||
#include "perf_precomp.hpp" |
||||
#include "opencv2/ts/gpu_perf.hpp" |
||||
|
||||
CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo()) |
||||
|
@ -0,0 +1,130 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_OCL_PRIVATE_UTIL__ |
||||
#define __OPENCV_OCL_PRIVATE_UTIL__ |
||||
|
||||
#include "opencv2/ocl/ocl.hpp" |
||||
|
||||
#if defined __APPLE__ |
||||
#include <OpenCL/OpenCL.h> |
||||
#else |
||||
#include <CL/opencl.h> |
||||
#endif |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
enum openCLMemcpyKind |
||||
{ |
||||
clMemcpyHostToDevice = 0, |
||||
clMemcpyDeviceToHost, |
||||
clMemcpyDeviceToDevice |
||||
}; |
||||
///////////////////////////OpenCL call wrappers////////////////////////////
|
||||
void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch, |
||||
size_t widthInBytes, size_t height); |
||||
void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch, |
||||
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type); |
||||
void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, |
||||
const void *src, size_t spitch, |
||||
size_t width, size_t height, openCLMemcpyKind kind, int channels = -1); |
||||
void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, |
||||
const void *src, size_t spitch, |
||||
size_t width, size_t height, int src_offset); |
||||
void CV_EXPORTS openCLFree(void *devPtr); |
||||
cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); |
||||
void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); |
||||
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, |
||||
const char **source, std::string kernelName); |
||||
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, |
||||
const char **source, std::string kernelName, const char *build_options); |
||||
void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); |
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args, |
||||
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); |
||||
void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName, |
||||
size_t globalThreads[3], size_t localThreads[3], |
||||
std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options); |
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth); |
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, |
||||
int depth, const char *build_options); |
||||
|
||||
cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value, |
||||
const size_t size); |
||||
|
||||
cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); |
||||
|
||||
int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName); |
||||
|
||||
enum FLUSH_MODE |
||||
{ |
||||
CLFINISH = 0, |
||||
CLFLUSH, |
||||
DISABLE |
||||
}; |
||||
|
||||
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE); |
||||
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, |
||||
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE); |
||||
// bind oclMat to OpenCL image textures
|
||||
// note:
|
||||
// 1. there is no memory management. User need to explicitly release the resource
|
||||
// 2. for faster clamping, there is no buffer padding for the constructed texture
|
||||
cl_mem CV_EXPORTS bindTexture(const oclMat &mat); |
||||
void CV_EXPORTS releaseTexture(cl_mem& texture); |
||||
|
||||
// returns whether the current context supports image2d_t format or not
|
||||
bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext()); |
||||
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
|
||||
#endif //__OPENCV_OCL_PRIVATE_UTIL__
|
@ -1,865 +0,0 @@ |
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable |
||||
#define MAX_FLOAT 1e7f |
||||
|
||||
int bit1Count(float x) |
||||
{ |
||||
int c = 0; |
||||
int ix = (int)x; |
||||
|
||||
for (int i = 0 ; i < 32 ; i++) |
||||
{ |
||||
c += ix & 0x1; |
||||
ix >>= 1; |
||||
} |
||||
|
||||
return (float)c; |
||||
} |
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size |
||||
local size: dim0 is block_size, dim1 is block_size. |
||||
*/ |
||||
__kernel void BruteForceMatch_UnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * max_desc_len; |
||||
|
||||
int queryIdx = groupidx * block_size + lidy; |
||||
|
||||
// load the query into local memory. |
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++) |
||||
{ |
||||
int loadx = lidx + i * block_size; |
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
} |
||||
|
||||
float myBestDistance = MAX_FLOAT; |
||||
int myBestTrainIdx = -1; |
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance. |
||||
volatile int imgIdx = 0; |
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
myBestDistance = result; |
||||
myBestTrainIdx = trainIdx; |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
__local float *s_distance = (__local float *)(sharebuffer); |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//find BestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
s_distance[lidx] = myBestDistance; |
||||
s_trainIdx[lidx] = myBestTrainIdx; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//reduce -- now all reduce implement in each threads. |
||||
for (int k = 0 ; k < block_size; k++) |
||||
{ |
||||
if (myBestDistance > s_distance[k]) |
||||
{ |
||||
myBestDistance = s_distance[k]; |
||||
myBestTrainIdx = s_trainIdx[k]; |
||||
} |
||||
} |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = myBestTrainIdx; |
||||
bestDistance[queryIdx] = myBestDistance; |
||||
} |
||||
} |
||||
|
||||
__kernel void BruteForceMatch_Match( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
|
||||
float myBestDistance = MAX_FLOAT; |
||||
int myBestTrainIdx = -1; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
// loop |
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
//Dist dist; |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) |
||||
{ |
||||
const int loadx = lidx + i * block_size; |
||||
//load query and train into local memory |
||||
s_query[lidy * block_size + lidx] = 0; |
||||
s_train[lidx * block_size + lidy] = 0; |
||||
|
||||
if (loadx < query_cols) |
||||
{ |
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
//myBestImgidx = imgIdx; |
||||
myBestDistance = result; |
||||
myBestTrainIdx = trainIdx; |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer; |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//findBestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
s_distance[lidx] = myBestDistance; |
||||
s_trainIdx[lidx] = myBestTrainIdx; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//reduce -- now all reduce implement in each threads. |
||||
for (int k = 0 ; k < block_size; k++) |
||||
{ |
||||
if (myBestDistance > s_distance[k]) |
||||
{ |
||||
myBestDistance = s_distance[k]; |
||||
myBestTrainIdx = s_trainIdx[k]; |
||||
} |
||||
} |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = myBestTrainIdx; |
||||
bestDistance[queryIdx] = myBestDistance; |
||||
} |
||||
} |
||||
|
||||
//radius_unrollmatch |
||||
__kernel void BruteForceMatch_RadiusUnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
float maxDistance, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__global int *nMatches, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int bestTrainIdx_cols, |
||||
int step, |
||||
int ostep, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
const int groupidy = get_group_id(1); |
||||
|
||||
const int queryIdx = groupidy * block_size + lidy; |
||||
const int trainIdx = groupidx * block_size + lidx; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); |
||||
|
||||
if (ind < bestTrainIdx_cols) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; |
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; |
||||
} |
||||
} |
||||
} |
||||
|
||||
//radius_match |
||||
__kernel void BruteForceMatch_RadiusMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
float maxDistance, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__global int *nMatches, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int bestTrainIdx_cols, |
||||
int step, |
||||
int ostep, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
const int groupidy = get_group_id(1); |
||||
|
||||
const int queryIdx = groupidy * block_size + lidy; |
||||
const int trainIdx = groupidx * block_size + lidx; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); |
||||
|
||||
if (ind < bestTrainIdx_cols) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; |
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int2 *bestTrainIdx, |
||||
__global float2 *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
local float *s_query = sharebuffer; |
||||
local float *s_train = sharebuffer + block_size * max_desc_len; |
||||
|
||||
// load the query into local memory. |
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++) |
||||
{ |
||||
int loadx = lidx + i * block_size; |
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
} |
||||
|
||||
float myBestDistance1 = MAX_FLOAT; |
||||
float myBestDistance2 = MAX_FLOAT; |
||||
int myBestTrainIdx1 = -1; |
||||
int myBestTrainIdx2 = -1; |
||||
|
||||
//loopUnrolledCached |
||||
volatile int imgIdx = 0; |
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++) |
||||
{ |
||||
const int loadX = lidx + i * block_size; |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows) |
||||
{ |
||||
if (result < myBestDistance1) |
||||
{ |
||||
myBestDistance2 = myBestDistance1; |
||||
myBestTrainIdx2 = myBestTrainIdx1; |
||||
myBestDistance1 = result; |
||||
myBestTrainIdx1 = trainIdx; |
||||
} |
||||
else if (result < myBestDistance2) |
||||
{ |
||||
myBestDistance2 = result; |
||||
myBestTrainIdx2 = trainIdx; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
local float *s_distance = (local float *)sharebuffer; |
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
// find BestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
|
||||
s_distance[lidx] = myBestDistance1; |
||||
s_trainIdx[lidx] = myBestTrainIdx1; |
||||
|
||||
float bestDistance1 = MAX_FLOAT; |
||||
float bestDistance2 = MAX_FLOAT; |
||||
int bestTrainIdx1 = -1; |
||||
int bestTrainIdx2 = -1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance1) |
||||
{ |
||||
bestDistance2 = bestDistance1; |
||||
bestTrainIdx2 = bestTrainIdx1; |
||||
|
||||
bestDistance1 = val; |
||||
bestTrainIdx1 = s_trainIdx[i]; |
||||
} |
||||
else if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
s_distance[lidx] = myBestDistance2; |
||||
s_trainIdx[lidx] = myBestTrainIdx2; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
myBestDistance1 = bestDistance1; |
||||
myBestDistance2 = bestDistance2; |
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1; |
||||
myBestTrainIdx2 = bestTrainIdx2; |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); |
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); |
||||
} |
||||
} |
||||
|
||||
__kernel void BruteForceMatch_knnMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int2 *bestTrainIdx, |
||||
__global float2 *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
local float *s_query = sharebuffer; |
||||
local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float myBestDistance1 = MAX_FLOAT; |
||||
float myBestDistance2 = MAX_FLOAT; |
||||
int myBestTrainIdx1 = -1; |
||||
int myBestTrainIdx2 = -1; |
||||
|
||||
//loop |
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0.0f; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) |
||||
{ |
||||
const int loadx = lidx + i * block_size; |
||||
//load query and train into local memory |
||||
s_query[lidy * block_size + lidx] = 0; |
||||
s_train[lidx * block_size + lidy] = 0; |
||||
|
||||
if (loadx < query_cols) |
||||
{ |
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
if (result < myBestDistance1) |
||||
{ |
||||
myBestDistance2 = myBestDistance1; |
||||
myBestTrainIdx2 = myBestTrainIdx1; |
||||
myBestDistance1 = result; |
||||
myBestTrainIdx1 = trainIdx; |
||||
} |
||||
else if (result < myBestDistance2) |
||||
{ |
||||
myBestDistance2 = result; |
||||
myBestTrainIdx2 = trainIdx; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer; |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//findBestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
|
||||
s_distance[lidx] = myBestDistance1; |
||||
s_trainIdx[lidx] = myBestTrainIdx1; |
||||
|
||||
float bestDistance1 = MAX_FLOAT; |
||||
float bestDistance2 = MAX_FLOAT; |
||||
int bestTrainIdx1 = -1; |
||||
int bestTrainIdx2 = -1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance1) |
||||
{ |
||||
bestDistance2 = bestDistance1; |
||||
bestTrainIdx2 = bestTrainIdx1; |
||||
|
||||
bestDistance1 = val; |
||||
bestTrainIdx1 = s_trainIdx[i]; |
||||
} |
||||
else if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
s_distance[lidx] = myBestDistance2; |
||||
s_trainIdx[lidx] = myBestTrainIdx2; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
myBestDistance1 = bestDistance1; |
||||
myBestDistance2 = bestDistance2; |
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1; |
||||
myBestTrainIdx2 = bestTrainIdx2; |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); |
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); |
||||
} |
||||
} |
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global float *allDist, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType) |
||||
{ |
||||
/* Todo */ |
||||
} |
||||
|
||||
kernel void BruteForceMatch_calcDistance( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global float *allDist, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType) |
||||
{ |
||||
/* Todo */ |
||||
} |
||||
|
||||
kernel void BruteForceMatch_findBestMatch( |
||||
__global float *allDist, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
int k, |
||||
int block_size |
||||
) |
||||
{ |
||||
/* Todo */ |
||||
} |
@ -1,81 +0,0 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef _OPENCV_MCWUTIL_ |
||||
#define _OPENCV_MCWUTIL_ |
||||
|
||||
#include "precomp.hpp" |
||||
using namespace std; |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
enum FLUSH_MODE |
||||
{ |
||||
CLFINISH = 0, |
||||
CLFLUSH, |
||||
DISABLE |
||||
}; |
||||
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE); |
||||
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], |
||||
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, |
||||
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE); |
||||
// bind oclMat to OpenCL image textures
|
||||
// note:
|
||||
// 1. there is no memory management. User need to explicitly release the resource
|
||||
// 2. for faster clamping, there is no buffer padding for the constructed texture
|
||||
cl_mem bindTexture(const oclMat &mat); |
||||
void releaseTexture(cl_mem& texture); |
||||
|
||||
// returns whether the current context supports image2d_t format or not
|
||||
bool support_image2d(Context *clCxt = Context::getContext()); |
||||
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
|
||||
#endif //_OPENCV_MCWUTIL_
|
@ -0,0 +1,865 @@ |
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable |
||||
#define MAX_FLOAT 1e7f |
||||
|
||||
int bit1Count(float x) |
||||
{ |
||||
int c = 0; |
||||
int ix = (int)x; |
||||
|
||||
for (int i = 0 ; i < 32 ; i++) |
||||
{ |
||||
c += ix & 0x1; |
||||
ix >>= 1; |
||||
} |
||||
|
||||
return (float)c; |
||||
} |
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size |
||||
local size: dim0 is block_size, dim1 is block_size. |
||||
*/ |
||||
__kernel void BruteForceMatch_UnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * max_desc_len; |
||||
|
||||
int queryIdx = groupidx * block_size + lidy; |
||||
|
||||
// load the query into local memory. |
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++) |
||||
{ |
||||
int loadx = lidx + i * block_size; |
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
} |
||||
|
||||
float myBestDistance = MAX_FLOAT; |
||||
int myBestTrainIdx = -1; |
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance. |
||||
volatile int imgIdx = 0; |
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
myBestDistance = result; |
||||
myBestTrainIdx = trainIdx; |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
__local float *s_distance = (__local float *)(sharebuffer); |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//find BestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
s_distance[lidx] = myBestDistance; |
||||
s_trainIdx[lidx] = myBestTrainIdx; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//reduce -- now all reduce implement in each threads. |
||||
for (int k = 0 ; k < block_size; k++) |
||||
{ |
||||
if (myBestDistance > s_distance[k]) |
||||
{ |
||||
myBestDistance = s_distance[k]; |
||||
myBestTrainIdx = s_trainIdx[k]; |
||||
} |
||||
} |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = myBestTrainIdx; |
||||
bestDistance[queryIdx] = myBestDistance; |
||||
} |
||||
} |
||||
|
||||
__kernel void BruteForceMatch_Match( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
|
||||
float myBestDistance = MAX_FLOAT; |
||||
int myBestTrainIdx = -1; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
// loop |
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
//Dist dist; |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) |
||||
{ |
||||
const int loadx = lidx + i * block_size; |
||||
//load query and train into local memory |
||||
s_query[lidy * block_size + lidx] = 0; |
||||
s_train[lidx * block_size + lidy] = 0; |
||||
|
||||
if (loadx < query_cols) |
||||
{ |
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
//myBestImgidx = imgIdx; |
||||
myBestDistance = result; |
||||
myBestTrainIdx = trainIdx; |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer; |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//findBestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
s_distance[lidx] = myBestDistance; |
||||
s_trainIdx[lidx] = myBestTrainIdx; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//reduce -- now all reduce implement in each threads. |
||||
for (int k = 0 ; k < block_size; k++) |
||||
{ |
||||
if (myBestDistance > s_distance[k]) |
||||
{ |
||||
myBestDistance = s_distance[k]; |
||||
myBestTrainIdx = s_trainIdx[k]; |
||||
} |
||||
} |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = myBestTrainIdx; |
||||
bestDistance[queryIdx] = myBestDistance; |
||||
} |
||||
} |
||||
|
||||
//radius_unrollmatch |
||||
__kernel void BruteForceMatch_RadiusUnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
float maxDistance, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__global int *nMatches, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int bestTrainIdx_cols, |
||||
int step, |
||||
int ostep, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
const int groupidy = get_group_id(1); |
||||
|
||||
const int queryIdx = groupidy * block_size + lidy; |
||||
const int trainIdx = groupidx * block_size + lidx; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); |
||||
|
||||
if (ind < bestTrainIdx_cols) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; |
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; |
||||
} |
||||
} |
||||
} |
||||
|
||||
//radius_match |
||||
__kernel void BruteForceMatch_RadiusMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
float maxDistance, |
||||
//__global float *mask, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
__global int *nMatches, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int bestTrainIdx_cols, |
||||
int step, |
||||
int ostep, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
const int groupidy = get_group_id(1); |
||||
|
||||
const int queryIdx = groupidy * block_size + lidy; |
||||
const int trainIdx = groupidx * block_size + lidx; |
||||
|
||||
__local float *s_query = sharebuffer; |
||||
__local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) |
||||
{ |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; ++j) |
||||
{ |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); |
||||
|
||||
if (ind < bestTrainIdx_cols) |
||||
{ |
||||
//bestImgIdx = imgIdx; |
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; |
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int2 *bestTrainIdx, |
||||
__global float2 *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
local float *s_query = sharebuffer; |
||||
local float *s_train = sharebuffer + block_size * max_desc_len; |
||||
|
||||
// load the query into local memory. |
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++) |
||||
{ |
||||
int loadx = lidx + i * block_size; |
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
} |
||||
|
||||
float myBestDistance1 = MAX_FLOAT; |
||||
float myBestDistance2 = MAX_FLOAT; |
||||
int myBestTrainIdx1 = -1; |
||||
int myBestTrainIdx2 = -1; |
||||
|
||||
//loopUnrolledCached |
||||
volatile int imgIdx = 0; |
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0; |
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++) |
||||
{ |
||||
const int loadX = lidx + i * block_size; |
||||
//load a block_size * block_size block into local train. |
||||
const int loadx = lidx + i * block_size; |
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; |
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already. |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows) |
||||
{ |
||||
if (result < myBestDistance1) |
||||
{ |
||||
myBestDistance2 = myBestDistance1; |
||||
myBestTrainIdx2 = myBestTrainIdx1; |
||||
myBestDistance1 = result; |
||||
myBestTrainIdx1 = trainIdx; |
||||
} |
||||
else if (result < myBestDistance2) |
||||
{ |
||||
myBestDistance2 = result; |
||||
myBestTrainIdx2 = trainIdx; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
local float *s_distance = (local float *)sharebuffer; |
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
// find BestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
|
||||
s_distance[lidx] = myBestDistance1; |
||||
s_trainIdx[lidx] = myBestTrainIdx1; |
||||
|
||||
float bestDistance1 = MAX_FLOAT; |
||||
float bestDistance2 = MAX_FLOAT; |
||||
int bestTrainIdx1 = -1; |
||||
int bestTrainIdx2 = -1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance1) |
||||
{ |
||||
bestDistance2 = bestDistance1; |
||||
bestTrainIdx2 = bestTrainIdx1; |
||||
|
||||
bestDistance1 = val; |
||||
bestTrainIdx1 = s_trainIdx[i]; |
||||
} |
||||
else if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
s_distance[lidx] = myBestDistance2; |
||||
s_trainIdx[lidx] = myBestTrainIdx2; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
myBestDistance1 = bestDistance1; |
||||
myBestDistance2 = bestDistance2; |
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1; |
||||
myBestTrainIdx2 = bestTrainIdx2; |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); |
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); |
||||
} |
||||
} |
||||
|
||||
__kernel void BruteForceMatch_knnMatch( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global int2 *bestTrainIdx, |
||||
__global float2 *bestDistance, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType |
||||
) |
||||
{ |
||||
const int lidx = get_local_id(0); |
||||
const int lidy = get_local_id(1); |
||||
const int groupidx = get_group_id(0); |
||||
|
||||
const int queryIdx = groupidx * block_size + lidy; |
||||
local float *s_query = sharebuffer; |
||||
local float *s_train = sharebuffer + block_size * block_size; |
||||
|
||||
float myBestDistance1 = MAX_FLOAT; |
||||
float myBestDistance2 = MAX_FLOAT; |
||||
int myBestTrainIdx1 = -1; |
||||
int myBestTrainIdx2 = -1; |
||||
|
||||
//loop |
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) |
||||
{ |
||||
float result = 0.0f; |
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) |
||||
{ |
||||
const int loadx = lidx + i * block_size; |
||||
//load query and train into local memory |
||||
s_query[lidy * block_size + lidx] = 0; |
||||
s_train[lidx * block_size + lidy] = 0; |
||||
|
||||
if (loadx < query_cols) |
||||
{ |
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to |
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ |
||||
|
||||
switch (distType) |
||||
{ |
||||
case 0: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
case 1: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; |
||||
result += qr * qr; |
||||
} |
||||
|
||||
break; |
||||
case 2: |
||||
|
||||
for (int j = 0 ; j < block_size ; j++) |
||||
{ |
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); |
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); |
||||
} |
||||
|
||||
break; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
const int trainIdx = t * block_size + lidx; |
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) |
||||
{ |
||||
if (result < myBestDistance1) |
||||
{ |
||||
myBestDistance2 = myBestDistance1; |
||||
myBestTrainIdx2 = myBestTrainIdx1; |
||||
myBestDistance1 = result; |
||||
myBestTrainIdx1 = trainIdx; |
||||
} |
||||
else if (result < myBestDistance2) |
||||
{ |
||||
myBestDistance2 = result; |
||||
myBestTrainIdx2 = trainIdx; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer; |
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); |
||||
|
||||
//findBestMatch |
||||
s_distance += lidy * block_size; |
||||
s_trainIdx += lidy * block_size; |
||||
|
||||
s_distance[lidx] = myBestDistance1; |
||||
s_trainIdx[lidx] = myBestTrainIdx1; |
||||
|
||||
float bestDistance1 = MAX_FLOAT; |
||||
float bestDistance2 = MAX_FLOAT; |
||||
int bestTrainIdx1 = -1; |
||||
int bestTrainIdx2 = -1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance1) |
||||
{ |
||||
bestDistance2 = bestDistance1; |
||||
bestTrainIdx2 = bestTrainIdx1; |
||||
|
||||
bestDistance1 = val; |
||||
bestTrainIdx1 = s_trainIdx[i]; |
||||
} |
||||
else if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
s_distance[lidx] = myBestDistance2; |
||||
s_trainIdx[lidx] = myBestTrainIdx2; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
{ |
||||
for (int i = 0 ; i < block_size ; i++) |
||||
{ |
||||
float val = s_distance[i]; |
||||
|
||||
if (val < bestDistance2) |
||||
{ |
||||
bestDistance2 = val; |
||||
bestTrainIdx2 = s_trainIdx[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
myBestDistance1 = bestDistance1; |
||||
myBestDistance2 = bestDistance2; |
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1; |
||||
myBestTrainIdx2 = bestTrainIdx2; |
||||
|
||||
if (queryIdx < query_rows && lidx == 0) |
||||
{ |
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); |
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); |
||||
} |
||||
} |
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global float *allDist, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int max_desc_len, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType) |
||||
{ |
||||
/* Todo */ |
||||
} |
||||
|
||||
kernel void BruteForceMatch_calcDistance( |
||||
__global float *query, |
||||
__global float *train, |
||||
//__global float *mask, |
||||
__global float *allDist, |
||||
__local float *sharebuffer, |
||||
int block_size, |
||||
int query_rows, |
||||
int query_cols, |
||||
int train_rows, |
||||
int train_cols, |
||||
int step, |
||||
int distType) |
||||
{ |
||||
/* Todo */ |
||||
} |
||||
|
||||
kernel void BruteForceMatch_findBestMatch( |
||||
__global float *allDist, |
||||
__global int *bestTrainIdx, |
||||
__global float *bestDistance, |
||||
int k, |
||||
int block_size |
||||
) |
||||
{ |
||||
/* Todo */ |
||||
} |
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue